# CS 155 Miniproject 1
# Philip Carr
# Model: Random Forest Regressor (from sklearn)

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

## Training Data Processing

Load the data and divide it into training and validation sets:

In [2]:
# Load the training data.
X = load_data("train_2008.csv")
N = len(X)

data = X[:, 3:-1]
label = X[:, -1]

train_percent = 70.
train_size = int(N * train_percent / 100)

# Randomly split the training data into training
# and validation sets.
random_order = np.random.permutation(np.arange(N))

x_train = data[random_order[0:train_size]]
y_train = label[random_order[0:train_size]]
x_validation = data[random_order[train_size:]]
y_validation = label[random_order[train_size:]]

In [3]:
print(np.shape(X))

(64667, 383)


Normalize the Data

In [4]:
print(x_train)

[[  1. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 ...
 [  1. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]
 [  1. 201.   0. ...   0.   0.   0.]]


In [5]:
y_train

array([1., 0., 0., ..., 1., 0., 0.])

In [6]:
# Normalizing the data.
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

In [7]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_train = x_train[:, std_nonzero_indices]
x_validation = x_validation[:, std_nonzero_indices]

In [8]:
print(x_train)

[[-0.61635698  0.3836888  -0.07875145 ... -0.10382691 -0.10360927
  -0.10469274]
 [-0.61635698  0.3836888  -0.07875145 ... -0.10382691 -0.10360927
  -0.10469274]
 [-0.61635698  0.3836888  -0.07875145 ... -0.10382691 -0.10360927
  -0.10469274]
 ...
 [-0.61635698  0.3836888  -0.07875145 ... -0.10382691 -0.10360927
  -0.10469274]
 [-0.61635698  0.3836888  -0.07875145 ... -0.10382691 -0.10360927
  -0.10469274]
 [-0.61635698  0.3836888  -0.07875145 ... -0.10382691 -0.10360927
  -0.10469274]]


In [9]:
print(y_train)

[1. 0. 0. ... 1. 0. 0.]


# Random Forest Regressor Initialization and Fitting

In [10]:
# rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features=20,
                           max_depth=15)

In [11]:
fit = rf.fit(x_train, y_train)

In [12]:
y_output_train = rf.predict(x_train)

In [13]:
y_output_train

array([0.15768492, 0.3316633 , 0.08460946, ..., 0.27886429, 0.34164051,
       0.0557242 ])

In [14]:
# Printing the accuracy of the model.
train_score = rf.score(x_train, y_train)
print('Train score:', train_score)
train_auc = roc_auc_score(y_train, y_output_train)
print('Train auc:', train_auc)

Train score: 0.4630632650087274
Train auc: 0.9358616702522273


In [15]:
y_train

array([1., 0., 0., ..., 1., 0., 0.])

In [16]:
y_output_lines = []
for i in range(len(y_output_train)):
    y_output_lines.append([i, y_output_train[i], y_train[i]])
np.savetxt("2008_train_output.csv", y_output_lines, fmt='%d,%f,%f')

## Validation Results

In [17]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
validation_score = rf.score(x_validation, y_validation)
print('Validation score:', validation_score)
y_output_validation = rf.predict(x_validation)
validation_auc = roc_auc_score(y_validation, y_output_validation)
print('Validation auc:', validation_auc)

Validation score: 0.18074632848908068
Validation auc: 0.7732541423466235


In [18]:
y_output_lines = []
for i in range(len(y_output_validation)):
    y_output_lines.append([i, y_output_validation[i], y_validation[i]])
np.savetxt("2008_validation_output.csv", y_output_lines, fmt='%d,%f,%f')

In [19]:
np.mean(cross_val_score(rf, data, label, cv=2, scoring="roc_auc"))

0.7723558754239088

In [20]:
np.mean(cross_val_score(rf, data, label, cv=3, scoring="roc_auc"))

0.7745233790616298

In [21]:
np.mean(cross_val_score(rf, data, label, cv=4, scoring="roc_auc"))

0.7759490259301046

# Test Output 2008

In [22]:
# Load the 2008 test data.
X_test = load_data("test_2008.csv")
ids = X_test[:,0]
x_test = X_test[:, 3:]

In [23]:
# Normalizing the data.
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [24]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_test = x_test[:, std_nonzero_indices]

In [25]:
y_output = rf.predict(x_test)

In [26]:
y_output_lines2 = []
for i in range(len(y_output)):
    y_output_lines2.append([i, y_output[i]])
np.savetxt("2008_submission.csv", y_output_lines2, fmt='%d,%f')

# Test Output 2012

In [27]:
# Load the 2012 test data.
X_test2 = load_data("test_2012.csv")
ids2 = X_test2[:,0]
x_test2 = X_test2[:, 3:]

In [28]:
# Normalizing the data.
for j in range(len(x_test2[0])):
    test_std = np.std(x_test2[:,j])
    if test_std != 0:
        x_test2[:,j] = \
            np.divide(x_test2[:,j] - np.mean(x_test2[:,j]),
                      np.std(x_test2[:,j]))

In [29]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_test2 = x_test2[:, std_nonzero_indices]

In [30]:
y_output2 = rf.predict(x_test2)

In [31]:
y_output_lines3 = []
for i in range(len(y_output2)):
    y_output_lines3.append([i, y_output2[i]])
np.savetxt("2012_submission.csv", y_output_lines3, fmt='%d,%f')