# CS 155 Final Exam Kaggle Competition
# Philip Carr
# Model: Random Forest Regressor (from sklearn)

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

  from numpy.core.umath_tests import inner1d


## Training Data Processing

Load the data and divide it into training and validation sets:

In [2]:
# Load the training data.
X = load_data("X_train.csv")
N = len(X)

data = X[:, 3:-1]

Y = load_data("y_train.csv")
label = Y

train_percent = 70.
train_size = int(N * train_percent / 100)

# Randomly split the training data into training
# and validation sets.
random_order = np.random.permutation(np.arange(N))

x_train = data[random_order[0:train_size]]
y_train = label[random_order[0:train_size]]
x_validation = data[random_order[train_size:]]
y_validation = label[random_order[train_size:]]

In [3]:
print(data)
print(label)

[[ 6.000e+00  1.200e+01  1.900e+01 ...  1.441e+03 -1.000e+00  0.000e+00]
 [ 6.000e+00  1.200e+01  2.500e+01 ...  1.350e+02 -1.000e+00  0.000e+00]
 [ 6.000e+00  1.200e+01  4.200e+01 ...  1.520e+03 -1.000e+00  0.000e+00]
 ...
 [ 3.000e+00  1.600e+01  3.900e+01 ... -1.000e+00 -1.000e+00  1.000e+00]
 [ 3.000e+00  1.600e+01  4.000e+01 ...  3.900e+01 -1.000e+00  0.000e+00]
 [ 3.000e+00  1.600e+01  4.000e+01 ...  5.620e+02 -1.000e+00  0.000e+00]]
[ 0. 18.  3. ...  3.  0.  3.]


In [4]:
print(np.shape(X))

(56490, 26)


Normalize the Data

In [5]:
print(x_train)

[[  0.   9.  32. ...  -1.  -1.   0.]
 [  0.  15.  40. ... 538.  -1.   0.]
 [  2.  15.  38. ...  -1.  -1.   0.]
 ...
 [  0.  11.  18. ... 132.  -1.   0.]
 [  3.   9.  56. ...  -1.  -1.   0.]
 [  4.   9.  50. ...  45.  -1.   0.]]


In [6]:
y_train

array([ 77.,  58., 105., ...,  47.,   6.,   3.])

In [7]:
# Normalizing the data.
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

In [8]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
print(std_nonzero_indices)
# x_train = x_train[:, std_nonzero_indices]
# x_validation = x_validation[:, std_nonzero_indices]

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


In [9]:
good_features = [1, 2, 3, 5, 6]
x_train = x_train[:, good_features]
x_validation = x_validation[:, good_features]

In [10]:
print(x_train)

[[-1.21786677  0.18663729  1.          0.72407523 -0.4209398 ]
 [ 1.03393253  0.63612865  1.          0.72407523 -0.4209398 ]
 [ 1.03393253  0.52375581  1.          0.72407523 -0.4209398 ]
 ...
 [-0.467267   -0.59997257  1.         -1.38107196 -0.4209398 ]
 [-1.21786677  1.53511135  1.          0.72407523 -0.4209398 ]
 [-1.21786677  1.19799284  1.         -1.38107196 -0.4209398 ]]


In [11]:
print(y_train)

[ 77.  58. 105. ...  47.   6.   3.]


# Random Forest Regressor Initialization and Fitting

In [14]:
# rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features=4,
                           max_depth=10)

In [15]:
fit = rf.fit(x_train, y_train)

In [16]:
y_output_train = rf.predict(x_train)

In [17]:
y_output_train

array([ 93.89532727,  94.77304028,  90.41630687, ...,  19.02957819,
       105.09677497,  19.23597147])

In [18]:
# Printing the accuracy of the model.
train_score = rf.score(x_train, y_train)
print('Train score:', train_score)
# train_auc = roc_auc_score(y_train, y_output_train)
# print('Train auc:', train_auc)
train_mse = mean_squared_error(y_train, y_output_train)
print('Train mse:', train_mse)

Train score: 0.4621174041284911
Train mse: 2610.165523031234


In [19]:
y_train

array([ 77.,  58., 105., ...,  47.,   6.,   3.])

In [20]:
y_output_lines = []
for i in range(len(y_output_train)):
    y_output_lines.append([i, y_output_train[i], y_train[i]])
np.savetxt("train_output.csv", y_output_lines, fmt='%d,%f,%f')

## Validation Results

In [21]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
validation_score = rf.score(x_validation, y_validation)
print('Validation score:', validation_score)
y_output_validation = rf.predict(x_validation)
# validation_auc = roc_auc_score(y_validation, y_output_validation)
# print('Validation auc:', validation_auc)
validation_mse = mean_squared_error(y_validation, y_output_validation)
print('Validation mse:', validation_mse)

Validation score: 0.44234682928989233
Validation mse: 2637.114138814914


In [22]:
y_output_lines = []
for i in range(len(y_output_validation)):
    y_output_lines.append([i, y_output_validation[i], y_validation[i]])
# np.savetxt("2008_validation_output.csv", y_output_lines, fmt='%d,%f,%f')
np.savetxt("validation_output.csv", y_output_lines, fmt='%d,%f,%f')

In [23]:
# np.mean(cross_val_score(rf, data, label, cv=2, scoring="roc_auc"))
np.mean(cross_val_score(rf, data, label, cv=2,
                        scoring="neg_mean_squared_error"))

-2282.245656189637

In [24]:
# np.mean(cross_val_score(rf, data, label, cv=3, scoring="roc_auc"))
np.mean(cross_val_score(rf, data, label, cv=3,
                        scoring="neg_mean_squared_error"))

-2220.398004827284

In [25]:
# np.mean(cross_val_score(rf, data, label, cv=4, scoring="roc_auc"))
np.mean(cross_val_score(rf, data, label, cv=4,
                        scoring="neg_mean_squared_error"))

-2041.474312603059

# Test Output

In [None]:
# Load the 2008 test data.
X_test = load_data("X_test.csv")
ids = np.arange(len(X_test))
x_test = X_test[:, 3:]

In [None]:
# Normalizing the data.
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [None]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_test = x_test[:, std_nonzero_indices]

In [None]:
y_output = rf.predict(x_test)

In [None]:
y_output_lines2 = []
for i in range(len(y_output)):
    y_output_lines2.append([i, y_output[i]])
np.savetxt("submission.csv", y_output_lines2, fmt='%d,%f')

# Test Output 2012

In [None]:
# # Load the 2012 test data.
# X_test2 = load_data("test_2012.csv")
# ids2 = X_test2[:,0]
# x_test2 = X_test2[:, 3:]

In [None]:
# # Normalizing the data.
# for j in range(len(x_test2[0])):
#     test_std = np.std(x_test2[:,j])
#     if test_std != 0:
#         x_test2[:,j] = \
#             np.divide(x_test2[:,j] - np.mean(x_test2[:,j]),
#                       np.std(x_test2[:,j]))

In [None]:
# # Remove features from the data that have standard
# # deviation of 0 in the training set.
# x_test2 = x_test2[:, std_nonzero_indices]

In [None]:
# y_output2 = rf.predict(x_test2)

In [None]:
# y_output_lines3 = []
# for i in range(len(y_output2)):
#     y_output_lines3.append([i, y_output2[i]])
# np.savetxt("2012_submission.csv", y_output_lines3, fmt='%d,%f')