# CS 155 Final Exam Kaggle Competition
# Philip Carr
# Model: XGBoost (from sklearn)

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error

import xgboost as xgb

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

  from numpy.core.umath_tests import inner1d


## Training Data Processing

Load the data and divide it into training and validation sets:

In [2]:
# Load the training data.
X = load_data("X_train.csv")
N = len(X)

data = X[:, 3:]

Y = load_data("y_train.csv")
label = Y

train_percent = 70.
train_size = int(N * train_percent / 100)

# Randomly split the training data into training
# and validation sets.
random_order = np.random.permutation(np.arange(N))

x_train = data[random_order[0:train_size]]
y_train = label[random_order[0:train_size]]
x_validation = data[random_order[train_size:]]
y_validation = label[random_order[train_size:]]

In [3]:
print(data)
print(label)

[[ 6. 12. 19. ... -1.  0.  1.]
 [ 6. 12. 25. ... -1.  0.  1.]
 [ 6. 12. 42. ... -1.  0.  1.]
 ...
 [ 3. 16. 39. ... -1.  1.  1.]
 [ 3. 16. 40. ... -1.  0. 67.]
 [ 3. 16. 40. ... -1.  0. 53.]]
[ 0. 18.  3. ...  3.  0.  3.]


In [4]:
print(np.shape(X))

(56490, 26)


Normalize the Data

In [5]:
print(x_train)

[[ 0. 19. 53. ... -1.  0.  1.]
 [ 1. 11. 32. ... -1.  0. 25.]
 [ 3. 14. 28. ... -1.  0.  8.]
 ...
 [ 1. 13.  3. ... -1.  0. 16.]
 [ 1. 16. 30. ... -1.  1.  4.]
 [ 1. 11.  1. ... -1.  0. 58.]]


In [6]:
print(np.shape(x_train))

(39543, 23)


In [7]:
y_train

array([ 19., 154.,   0., ...,  26.,  74., 161.])

In [8]:
# Normalizing the data.
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

In [9]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
print(std_nonzero_indices)
x_train = x_train[:, std_nonzero_indices]
x_validation = x_validation[:, std_nonzero_indices]

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [10]:
# good_features = [1, 2, 3, 4, 5, 9, 10, 14, 15, 19, 21, 22]
# x_train = x_train[:, good_features]
# x_validation = x_validation[:, good_features]

In [11]:
print(x_train)

[[-1.41405561  2.53725289  1.36097346 ... -0.02237569 -0.2089506
  -0.86431753]
 [-0.80726891 -0.46653336  0.18100094 ... -0.02237569 -0.2089506
   0.27322835]
 [ 0.40630448  0.65988648 -0.04375574 ... -0.02237569 -0.2089506
  -0.53253331]
 ...
 [-0.80726891  0.2844132  -1.44848493 ... -0.02237569 -0.2089506
  -0.15335136]
 [-0.80726891  1.41083305  0.0686226  ... -0.02237569  0.17531754
  -0.72212429]
 [-0.80726891 -0.46653336 -1.56086327 ... -0.02237569 -0.2089506
   1.83735393]]


In [12]:
print(np.shape(x_train))

(39543, 21)


In [13]:
print(y_train)

[ 19. 154.   0. ...  26.  74. 161.]


In [14]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalidation = xgb.DMatrix(x_validation, label=y_validation)
#dtest = xgb.DMatrix(x_test)
# specify parameters via map
param = {'max_depth':3, 'eta':0.5, 'silent':1,
         'objective':'reg:linear', 'eval_metric':'rmse',
         'scale_pos_rate':1, 'n_estimators':1000}
num_round = 3

In [15]:
bst = None
for num_r in range(2, 21):
    #print(num_r)
    #print(xgb.cv(param, dtrain, num_r, 5))
    
    #print(xgb.cv(param, dtrain, num_round, 5))
    print()
    print(num_r)
    bst = xgb.train(param, dtrain, num_boost_round=num_r)
    # make prediction
    
    preds = bst.predict(dtrain)
    print(mean_squared_error(y_train, preds))
    
    preds = bst.predict(dvalidation)
    print(mean_squared_error(y_validation, preds))
    
    #preds = bst.predict(dtest)


2
2795.589971417476
2820.907695658054

3
2260.2117014292307
2352.714476971721

4
2037.8595163135544
2287.084605910146

5
1923.1239921270158
2200.970136016111

6
1847.1668339143414
2128.178120746054

7
1791.6149205742101
2089.5828924643542

8
1715.8461236132855
2018.5500040098632

9
1685.9586529297555
1975.0890765422494

10
1669.7721747903113
1965.9982478463355

11
1629.3882856379387
1912.2722620374693

12
1611.5745076671246
1896.0565885936328

13
1588.3544593521424
1855.2085969430148

14
1572.4075283012537
1843.7889251856127

15
1546.7641444046842
1820.0209136578649

16
1532.2743057151365
1803.557912200076

17
1526.1320759129826
1794.3146579266381

18
1499.3851702610232
1761.2368288546154

19
1480.6084586086545
1744.8284991801154

20
1477.7609084695237
1742.8606434663247


In [16]:
print(bst)

<xgboost.core.Booster object at 0x000001D36415A550>


# XGBoost Initialization and Fitting

In [17]:
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb1 = XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=300,
                    silent=True, objective='reg:linear',
                    booster='gbtree', n_jobs=4, nthread=None,
                    gamma=0, min_child_weight=6, max_delta_step=0,
                    subsample=0.8, colsample_bytree=0.8,
                    colsample_bylevel=1, reg_alpha=0.20, 
                    reg_lambda=1, scale_pos_weight=1, 
                    base_score=0.5, random_state=0, 
                    seed=None, missing=None, importance_type='gain')
xgb1.fit(x_train, y_train, eval_metric='rmse')

param_test1 = {
 'max_depth':range(4,7),
 'min_child_weight':range(4,7)
}  # result: max_depth = 5, max_child_weight = 6 (old)
param_test2 = {
 'n_estimators':range(150, 310, 25)
} # result: n_estimators = 200 (old)
param_test3 = {
    'max_depth':range(4,7),
    'min_child_weight':range(4,7),
    'n_estimators':range(150, 310, 25)
} # result: max_depth = 6, max_child_weight = 4, n_estimators = 300
param_test4 = {
    "objective":["reg:linear", "reg:logistic", "reg:tweedie"]
    'learning_rate':[0.1, 0.01, 0.001],
    'booster':["gbtree", "gblinear", "dart"],
    "gamma":[1, 0.1, 0.01, 0.001],
    "max_delta_step":[0, 1]
} # result: 
gsearch1 = GridSearchCV(
    estimator = XGBRegressor( learning_rate =0.1, n_estimators=300,
                             max_depth=6, min_child_weight=4, gamma=0,
                             subsample=0.8, colsample_bytree=0.8,
                             objective= 'reg:linear', nthread=4,
                             scale_pos_weight=1, seed=27),
    param_grid = param_test3, scoring='neg_mean_squared_error',
    n_jobs=4,iid=False, cv=3, verbose=999)
gsearch1.fit(x_train, y_train)
print(gsearch1.best_params_, gsearch1.best_score_)

Fitting 3 folds for each of 63 candidates, totalling 189 fits
Memmaping (shape=(39543, 21), dtype=float64) to new file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32

[Parallel(n_jobs=4)]: Done  18 tasks      | elapsed:   51.3s
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  19 tasks      | elapsed:   56.5s
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed:   59.0s
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done  40 tasks      | elapsed:  1.8min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  41 tasks      | elapsed:  1.9min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.9min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done  62 tasks      | elapsed:  2.8min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  63 tasks      | elapsed:  2.8min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  2.9min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done  84 tasks      | elapsed:  4.0min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  85 tasks      | elapsed:  4.0min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done  86 tasks      | elapsed:  4.1min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done 106 tasks      | elapsed:  5.2min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 107 tasks      | elapsed:  5.2min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 108 tasks      | elapsed:  5.3min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:  6.4min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 129 tasks      | elapsed:  6.4min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:  6.4min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done 150 tasks      | elapsed:  7.7min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 151 tasks      | elapsed:  7.8min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 152 tasks      | elapsed:  7.8min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

[Parallel(n_jobs=4)]: Done 172 tasks      | elapsed:  9.1min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  9.1min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6e449d99ea97109bbbc4ebf80f.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
[Parallel(n_jobs=4)]: Done 174 tasks      | elapsed:  9.2min
Memmaping (shape=(39543, 21), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_25368_2007491739600\25368-2007493604744-cad57b6

In [18]:
for key in gsearch1.cv_results_:
    print(key, gsearch1.cv_results_[key])

mean_fit_time [ 6.18578966  7.98963189  9.78649354 10.05145216 10.52551778 12.0029006
 14.74922188  7.03850929  7.58039387  9.33403707  9.74294305 11.37458007
 12.5870049  12.8589437   6.52953768  8.68909558  9.27519504  9.86794249
 11.43774565 11.82205041 13.14650861  8.55545251  9.80910071 12.1548268
 12.41446575 14.15115547 15.82733933 18.35591038  9.74493917 10.07904545
 10.90550081 12.91346375 13.31106734 16.04575555 18.22393012  8.77054477
 11.08668375 11.15716203 12.25954668 14.07502453 14.45334737 16.05639354
 10.2489241  11.14153703 13.47319102 14.14905119 15.97389952 19.57284745
 18.88972465  9.68533961 11.74326134 12.37856221 15.65147551 15.94834868
 17.38317831 20.34359455  9.79945954 10.4799734  12.62490296 14.16844233
 15.92607411 18.01382581 14.568705  ]
std_fit_time [0.74314933 0.07055546 0.33709692 0.45333944 0.08454865 0.35080372
 0.30868262 0.45309194 0.12980171 0.1829011  0.18288129 0.28313484
 0.37038673 0.11319046 0.09418346 0.03584222 0.37523137 0.25014121
 0.141



In [19]:
y_output_train = xgb1.predict(x_train)

y_output_validation = xgb1.predict(x_validation)

In [20]:
y_output_train = gsearch1.predict(x_train)

y_output_validation = gsearch1.predict(x_validation)

In [21]:
y_output_train

array([ 16.20243 , 115.057724,  -1.015309, ...,  31.891134,  61.802612,
       120.41816 ], dtype=float32)

In [22]:
# Printing the accuracy of the model.
# train_score = rf.score(x_train, y_train)
# print('Train score:', train_score)
# train_auc = roc_auc_score(y_train, y_output_train)
# print('Train auc:', train_auc)
train_mse = mean_squared_error(y_train, y_output_train)
print('Train mse:', train_mse)

Train mse: 886.5454979398751


In [23]:
y_train

array([ 19., 154.,   0., ...,  26.,  74., 161.])

In [24]:
y_output_lines = []
for i in range(len(y_output_train)):
    y_output_lines.append([i, y_output_train[i], y_train[i]])
np.savetxt("train_output.csv", y_output_lines, fmt='%d,%f,%f')

## Validation Results

In [25]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
# validation_score = rf.score(x_validation, y_validation)
# print('Validation score:', validation_score)
# y_output_validation = rf.predict(x_validation)
# validation_auc = roc_auc_score(y_validation, y_output_validation)
# print('Validation auc:', validation_auc)
validation_mse = mean_squared_error(y_validation, y_output_validation)
print('Validation mse:', validation_mse)

Validation mse: 1232.0669322119059


In [26]:
y_output_lines = []
for i in range(len(y_output_validation)):
    y_output_lines.append([i, y_output_validation[i], y_validation[i]])
# np.savetxt("2008_validation_output.csv", y_output_lines, fmt='%d,%f,%f')
np.savetxt("validation_output.csv", y_output_lines, fmt='%d,%f,%f')

In [27]:
# np.mean(cross_val_score(rf, data, label, cv=2, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=2,
#                         scoring="neg_mean_squared_error"))

In [28]:
# np.mean(cross_val_score(rf, data, label, cv=3, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=3,
#                         scoring="neg_mean_squared_error"))

In [29]:
# np.mean(cross_val_score(rf, data, label, cv=4, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=4,
#                         scoring="neg_mean_squared_error"))

# Test Output

In [30]:
# Load the 2008 test data.
X_test = load_data("X_test.csv")
ids = np.arange(len(X_test))
x_test = X_test[:, 3:]

In [31]:
# Normalizing the data.
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [32]:
print(np.shape(x_test))

(37661, 23)


In [33]:
print(x_test[0])

[ 0.38537524  1.3166926   0.65127962  1.         -0.23222022 -1.15704656
 -0.41129133  2.23731563 -0.26412652  0.11093225  1.69230879 -0.03091436
  0.         -0.22427582  0.92804801 -0.81476327 -0.09169316 -0.23893834
 -0.03754031  1.34197185 -0.02107569  0.70466197 -0.81171682]


In [34]:
print(std_nonzero_indices)

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [35]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_test = x_test[:, std_nonzero_indices]

# x_test = x_test[:, good_features]

In [36]:
print(np.shape(x_test))

(37661, 21)


In [37]:
print(x_test[0])

[ 0.38537524  1.3166926   0.65127962 -0.23222022 -1.15704656 -0.41129133
  2.23731563 -0.26412652  0.11093225  1.69230879 -0.03091436 -0.22427582
  0.92804801 -0.81476327 -0.09169316 -0.23893834 -0.03754031  1.34197185
 -0.02107569  0.70466197 -0.81171682]


In [38]:
y_output = xgb1.predict(x_test)

In [39]:
y_output_lines2 = []
for i in range(len(y_output)):
    y_output_lines2.append([i, y_output[i]])
np.savetxt("submission.csv", y_output_lines2, fmt='%d,%f')

# Test Output 2012

In [40]:
# # Load the 2012 test data.
# X_test2 = load_data("test_2012.csv")
# ids2 = X_test2[:,0]
# x_test2 = X_test2[:, 3:]

In [41]:
# # Normalizing the data.
# for j in range(len(x_test2[0])):
#     test_std = np.std(x_test2[:,j])
#     if test_std != 0:
#         x_test2[:,j] = \
#             np.divide(x_test2[:,j] - np.mean(x_test2[:,j]),
#                       np.std(x_test2[:,j]))

In [42]:
# # Remove features from the data that have standard
# # deviation of 0 in the training set.
# x_test2 = x_test2[:, std_nonzero_indices]

In [43]:
# y_output2 = rf.predict(x_test2)

In [44]:
# y_output_lines3 = []
# for i in range(len(y_output2)):
#     y_output_lines3.append([i, y_output2[i]])
# np.savetxt("2012_submission.csv", y_output_lines3, fmt='%d,%f')