# CS 155 Final Exam Kaggle Competition
# Philip Carr
# Model: XGBoost (from sklearn)

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error

import xgboost as xgb

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

  from numpy.core.umath_tests import inner1d


## Training Data Processing

Load the data and divide it into training and validation sets:

In [2]:
# Load the training data.
X = load_data("X_train.csv")
N = len(X)

data = X[:, 3:]

Y = load_data("y_train.csv")
label = Y

train_percent = 70.
train_size = int(N * train_percent / 100)

# Randomly split the training data into training
# and validation sets.
random_order = np.random.permutation(np.arange(N))

x_train = data[random_order[0:train_size]]
y_train = label[random_order[0:train_size]]
x_validation = data[random_order[train_size:]]
y_validation = label[random_order[train_size:]]

In [3]:
print(data)
print(label)

[[ 6. 12. 19. ... -1.  0.  1.]
 [ 6. 12. 25. ... -1.  0.  1.]
 [ 6. 12. 42. ... -1.  0.  1.]
 ...
 [ 3. 16. 39. ... -1.  1.  1.]
 [ 3. 16. 40. ... -1.  0. 67.]
 [ 3. 16. 40. ... -1.  0. 53.]]
[ 0. 18.  3. ...  3.  0.  3.]


In [4]:
print(np.shape(X))

(56490, 26)


Dealing with Missing Values

In [5]:
print(x_train)

[[ 5.  9. 25. ... -1.  0.  5.]
 [ 2. 15. 48. ... -1.  0.  4.]
 [ 2. 14.  6. ... -1.  0.  3.]
 ...
 [ 0. 13. 40. ... -1.  0. -1.]
 [ 1. 15. 32. ... -1.  0. -1.]
 [ 0. 11. 38. ... -1.  0. 61.]]


In [6]:
print(np.shape(x_train))

(39543, 23)


In [7]:
y_train

array([160.,  89.,  73., ...,   1.,  35.,  59.])

In [None]:
for j in range(len(x_train[0])):
    col_median = np.median()

Normalize the Data

In [8]:
# Normalizing the data.
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

In [9]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
print(std_nonzero_indices)
x_train = x_train[:, std_nonzero_indices]
x_validation = x_validation[:, std_nonzero_indices]

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [10]:
# good_features = [1, 2, 3, 4, 5, 9, 10, 14, 15, 19, 21, 22]
# x_train = x_train[:, good_features]
# x_validation = x_validation[:, good_features]

In [11]:
print(x_train)

[[ 1.6273471  -1.22384979 -0.20872159 ... -0.02198608 -0.20236856
  -0.67953191]
 [-0.19836275  1.03570147  1.0823754  ... -0.02198608 -0.20236856
  -0.72677732]
 [-0.19836275  0.65910959 -1.27527998 ... -0.02198608 -0.20236856
  -0.77402272]
 ...
 [-1.41550265  0.28251772  0.63329819 ... -0.02198608 -0.20236856
  -0.96300435]
 [-0.8069327   1.03570147  0.18422097 ... -0.02198608 -0.20236856
  -0.96300435]
 [-1.41550265 -0.47066604  0.52102888 ... -0.02198608 -0.20236856
   1.96621084]]


In [12]:
print(np.shape(x_train))

(39543, 22)


In [13]:
print(y_train)

[160.  89.  73. ...   1.  35.  59.]


In [14]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalidation = xgb.DMatrix(x_validation, label=y_validation)
#dtest = xgb.DMatrix(x_test)
# specify parameters via map
param = {'max_depth':3, 'eta':0.5, 'silent':1,
         'objective':'reg:linear', 'eval_metric':'rmse',
         'scale_pos_rate':1, 'n_estimators':1000}
num_round = 3

In [15]:
bst = None
for num_r in range(2, 21):
    #print(num_r)
    #print(xgb.cv(param, dtrain, num_r, 5))
    
    #print(xgb.cv(param, dtrain, num_round, 5))
    print()
    print(num_r)
    bst = xgb.train(param, dtrain, num_boost_round=num_r)
    # make prediction
    
    preds = bst.predict(dtrain)
    print(mean_squared_error(y_train, preds))
    
    preds = bst.predict(dvalidation)
    print(mean_squared_error(y_validation, preds))
    
    #preds = bst.predict(dtest)


2
2801.2797210275266
3113.683099480169

3
2290.552143654139
2446.6295091422026

4
2071.506395660251
2310.7938415165804

5
1948.402039235269
2414.8534090261355

6
1878.458282969424
2304.5774111912797

7
1813.1592343120533
2266.5648544035594

8
1770.272178824099
2214.853416479521

9
1730.5120765461063
2177.5305099942652

10
1698.2125110038996
2144.3644022437143

11
1656.6770024732573
2093.5711236412208

12
1615.2495471766024
2035.742416739422

13
1591.7931276946997
2028.0048191025624

14
1578.7023912181457
2013.6656667206516

15
1562.5184751633585
1997.1087100609338

16
1520.6938388428503
1953.0914945697884

17
1513.2681935337482
1952.2252170475224

18
1506.5729688783401
1932.701605656143

19
1499.4603609623664
1920.884044643908

20
1492.4060626542732
1914.957990365629


In [16]:
print(bst)

<xgboost.core.Booster object at 0x00000182D35E25F8>


# XGBoost Initialization and Fitting

In [17]:
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb1 = XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=300,
                    silent=True, objective='reg:linear',
                    booster='gbtree', n_jobs=4, nthread=None,
                    gamma=0, min_child_weight=6, max_delta_step=0,
                    subsample=0.8, colsample_bytree=0.8,
                    colsample_bylevel=1, reg_alpha=0.20, 
                    reg_lambda=1, scale_pos_weight=1, 
                    base_score=0.5, random_state=0, 
                    seed=None, missing=None, importance_type='gain')
xgb1.fit(x_train, y_train, eval_metric='rmse')

# param_test1 = {
#  'max_depth':range(4,7),
#  'min_child_weight':range(4,7)
# }  # result: max_depth = 5, max_child_weight = 6 (old)
# param_test2 = {
#  'n_estimators':range(150, 310, 25)
# } # result: n_estimators = 200 (old)
# param_test3 = {
#     'max_depth':range(4,7),
#     'min_child_weight':range(4,7),
#     'n_estimators':range(150, 310, 25)
# } # result: max_depth = 6, max_child_weight = 4, n_estimators = 300
# param_test4 = {
#     "objective":["reg:linear"],
#     'learning_rate':[0.1, 0.01],
#     'booster':["gbtree", "gblinear", "dart"],
#     "gamma":[0, 1, 0.1, 0.01],
#     "max_delta_step":[0, 1]
# } # result: {'booster': 'dart', 'gamma': 1, 'learning_rate': 0.1, 
# # 'max_delta_step': 0, 'objective': 'reg:linear'}
# param_test5= {
#     "lambda":[0, 1, 10, 20, 100],
#     "alpha":[0, 1, 10, 20, 100],
#     "tree_method":["auto"]
# }
# param_test6= {
#     "subsample":[0.6, 0.8, 1],
#     "colsample_bytree":[0.6, 0.8, 1],
#     "colsample_bylevel":[0.6, 0.8, 1],
#     "colsample_bynode":[0.6, 0.8, 1]
# }
# gsearch1 = GridSearchCV(
#     estimator = XGBRegressor( learning_rate =0.1, n_estimators=300,
#                              max_depth=6, min_child_weight=4, gamma=1,
#                              subsample=0.8, colsample_bytree=0.8,
#                              objective= 'reg:linear', nthread=4,
#                              booster="dart", max_delta_step=0,
#                              scale_pos_weight=1, seed=27),
#     param_grid = param_test4, scoring='neg_mean_squared_error',
#     n_jobs=4,iid=False, cv=3, verbose=999)
# gsearch1.fit(x_train, y_train)
# print(gsearch1.best_params_, gsearch1.best_score_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Memmaping (shape=(39543, 22), dtype=float64) to new file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32

[Parallel(n_jobs=4)]: Done  18 tasks      | elapsed:  1.6min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  19 tasks      | elapsed:  1.7min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed:  1.7min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1ad

[Parallel(n_jobs=4)]: Done  40 tasks      | elapsed:  3.2min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  41 tasks      | elapsed:  3.3min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.4min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1ad

[Parallel(n_jobs=4)]: Done  62 tasks      | elapsed:  4.2min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  63 tasks      | elapsed:  4.3min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  4.3min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1ad

[Parallel(n_jobs=4)]: Done  84 tasks      | elapsed:  5.1min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  85 tasks      | elapsed:  5.1min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  86 tasks      | elapsed:  5.1min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1ad

[Parallel(n_jobs=4)]: Done 106 tasks      | elapsed:  9.7min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 107 tasks      | elapsed:  9.7min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 108 tasks      | elapsed:  9.7min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1ad

[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed: 18.0min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 129 tasks      | elapsed: 18.1min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1add8b9333d3650a249730f8c222.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed: 18.3min
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_38152_1661468380128\38152-1661466439632-ab7a1ad

In [18]:
for key in gsearch1.cv_results_:
    print(key, gsearch1.cv_results_[key])

mean_fit_time [ 18.00351898  16.07766954  19.6391449   13.14983185  22.88142904
  20.42666006  23.84855533  14.47894573  20.81632996  16.24954263
  19.42006453  13.12091009  18.80404496  15.59595704  19.06368454
  15.01351476   6.18612321   7.94807768   9.02586222   8.1625042
   8.15086929   8.86229912   8.4733394    8.7682174   11.80376617
   9.89221072   9.10930506   8.51323191   8.52652947   8.68477376
   8.56609114   7.9397668  108.41240493  68.61982123  96.18055058
  52.92865364 121.33527915  85.10041499 112.27092314  56.20702521
 116.30879084  75.83120338  93.50925978  44.51428755  93.18612409
  68.5267388  102.26152031  46.93038988]
std_fit_time [ 0.99061121  0.30413969  0.31337189  0.14211495  0.81270689  0.65024566
  0.87499717  0.60724552  0.2249344   0.30425396  0.28525273  0.42311927
  0.35828648  0.22140145  1.16640388  1.43085429  1.05380891  0.35017694
  0.27904727  0.49127876  0.11256554  0.28403013  0.45937333  0.51382283
  0.25483994  0.74842614  0.08123733  0.3251614



In [19]:
y_output_train = xgb1.predict(x_train)

y_output_validation = xgb1.predict(x_validation)

In [20]:
y_output_train = gsearch1.predict(x_train)

y_output_validation = gsearch1.predict(x_validation)

In [21]:
y_output_train

array([140.1199  ,  73.98692 , 114.89906 , ...,   9.024885,   9.171138,
       106.03485 ], dtype=float32)

In [22]:
# Printing the accuracy of the model.
# train_score = rf.score(x_train, y_train)
# print('Train score:', train_score)
# train_auc = roc_auc_score(y_train, y_output_train)
# print('Train auc:', train_auc)
train_mse = mean_squared_error(y_train, y_output_train)
print('Train mse:', train_mse)

Train mse: 838.9916747343972


In [23]:
y_train

array([160.,  89.,  73., ...,   1.,  35.,  59.])

In [24]:
y_output_lines = []
for i in range(len(y_output_train)):
    y_output_lines.append([i, y_output_train[i], y_train[i]])
np.savetxt("train_output.csv", y_output_lines, fmt='%d,%f,%f')

## Validation Results

In [25]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
# validation_score = rf.score(x_validation, y_validation)
# print('Validation score:', validation_score)
# y_output_validation = rf.predict(x_validation)
# validation_auc = roc_auc_score(y_validation, y_output_validation)
# print('Validation auc:', validation_auc)
validation_mse = mean_squared_error(y_validation, y_output_validation)
print('Validation mse:', validation_mse)

Validation mse: 2008.7982352870677


In [26]:
y_output_lines = []
for i in range(len(y_output_validation)):
    y_output_lines.append([i, y_output_validation[i], y_validation[i]])
# np.savetxt("2008_validation_output.csv", y_output_lines, fmt='%d,%f,%f')
np.savetxt("validation_output.csv", y_output_lines, fmt='%d,%f,%f')

In [27]:
# np.mean(cross_val_score(rf, data, label, cv=2, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=2,
#                         scoring="neg_mean_squared_error"))

In [28]:
# np.mean(cross_val_score(rf, data, label, cv=3, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=3,
#                         scoring="neg_mean_squared_error"))

In [29]:
# np.mean(cross_val_score(rf, data, label, cv=4, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=4,
#                         scoring="neg_mean_squared_error"))

# Test Output

In [30]:
# Load the 2008 test data.
X_test = load_data("X_test.csv")
ids = np.arange(len(X_test))
x_test = X_test[:, 3:]

In [31]:
# Normalizing the data.
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [32]:
print(np.shape(x_test))

(37661, 23)


In [33]:
print(x_test[0])

[ 0.38537524  1.3166926   0.65127962  1.         -0.23222022 -1.15704656
 -0.41129133  2.23731563 -0.26412652  0.11093225  1.69230879 -0.03091436
  0.         -0.22427582  0.92804801 -0.81476327 -0.09169316 -0.23893834
 -0.03754031  1.34197185 -0.02107569  0.70466197 -0.81171682]


In [34]:
print(std_nonzero_indices)

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [35]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_test = x_test[:, std_nonzero_indices]

# x_test = x_test[:, good_features]

In [36]:
print(np.shape(x_test))

(37661, 22)


In [37]:
print(x_test[0])

[ 0.38537524  1.3166926   0.65127962 -0.23222022 -1.15704656 -0.41129133
  2.23731563 -0.26412652  0.11093225  1.69230879 -0.03091436  0.
 -0.22427582  0.92804801 -0.81476327 -0.09169316 -0.23893834 -0.03754031
  1.34197185 -0.02107569  0.70466197 -0.81171682]


In [38]:
y_output = xgb1.predict(x_test)

In [39]:
y_output_lines2 = []
for i in range(len(y_output)):
    y_output_lines2.append([i, y_output[i]])
np.savetxt("submission.csv", y_output_lines2, fmt='%d,%f')

# Test Output 2012

In [40]:
# # Load the 2012 test data.
# X_test2 = load_data("test_2012.csv")
# ids2 = X_test2[:,0]
# x_test2 = X_test2[:, 3:]

In [41]:
# # Normalizing the data.
# for j in range(len(x_test2[0])):
#     test_std = np.std(x_test2[:,j])
#     if test_std != 0:
#         x_test2[:,j] = \
#             np.divide(x_test2[:,j] - np.mean(x_test2[:,j]),
#                       np.std(x_test2[:,j]))

In [42]:
# # Remove features from the data that have standard
# # deviation of 0 in the training set.
# x_test2 = x_test2[:, std_nonzero_indices]

In [43]:
# y_output2 = rf.predict(x_test2)

In [44]:
# y_output_lines3 = []
# for i in range(len(y_output2)):
#     y_output_lines3.append([i, y_output2[i]])
# np.savetxt("2012_submission.csv", y_output_lines3, fmt='%d,%f')