# CS 155 Final Exam Kaggle Competition
# Philip Carr
# Model: XGBoost (from sklearn)

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error

import xgboost as xgb

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

  from numpy.core.umath_tests import inner1d


## Training Data Processing

Load the data and divide it into training and validation sets:

In [2]:
# Load the training data.
X = load_data("X_train.csv")
N = len(X)

data = X[:, 3:]

Y = load_data("y_train.csv")
label = Y

train_percent = 70.
train_size = int(N * train_percent / 100)

# Randomly split the training data into training
# and validation sets.
random_order = np.random.permutation(np.arange(N))

x_train = data[random_order[0:train_size]]
y_train = label[random_order[0:train_size]]
x_validation = data[random_order[train_size:]]
y_validation = label[random_order[train_size:]]

In [3]:
print(data)
print(label)

[[ 6. 12. 19. ... -1.  0.  1.]
 [ 6. 12. 25. ... -1.  0.  1.]
 [ 6. 12. 42. ... -1.  0.  1.]
 ...
 [ 3. 16. 39. ... -1.  1.  1.]
 [ 3. 16. 40. ... -1.  0. 67.]
 [ 3. 16. 40. ... -1.  0. 53.]]
[ 0. 18.  3. ...  3.  0.  3.]


In [4]:
print(np.shape(X))

(56490, 26)


Setting Categorical Features

In [5]:
# print(x_train[:5,:5])
# categorical_features = []
# for j in range(len(x_train[0])):
#     is_categorical = True
#     for i in range(len(x_train)):
#         if int(x_train[i,j]) not in [0, 1]:
#             print(j,i,x_train[i,j])
#             is_categorical = False
#             break
#     if is_categorical:
#         categorical_features.append(j)
# print(categorical_features)

[[ 4. 15.  8.  1.  0.]
 [ 2.  9.  1.  1.  0.]
 [ 2. 15. 28.  1.  0.]
 [ 1.  9.  0.  1.  0.]
 [ 2.  9. 25.  1.  0.]]
0 0 4.0
1 0 15.0
2 0 8.0
9 0 229.0
10 11 10.0
11 961 10.0
12 22519 4.0
13 0 -1.0
19 0 -1.0
20 0 -1.0
21 4 2.0
22 0 11.0
[3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18]


In [None]:
# from keras.utils import to_categorical

# total_cols = len(x_train[0]) + len(categorical_features)
# new_x_train = np.zeros(shape=(len(x_train), total_cols))
# new_x_validation = np.zeros(shape=(len(x_validation), total_cols))

# count = 0
# for j in range(total_cols):
#     if j not in categorical_features:
#         new_x_train[:,j] = x_train[:,j]
#         new_x_validation[:,j] = x_validation[:,j]
#     else:
#         feature_cols_train = to_categorical(x_train[:,j])
#         new_x_train[]
#         count += 1

Dealing with Missing Values

In [6]:
print(x_train)

[[ 4. 15.  8. ... -1.  0. 11.]
 [ 2.  9.  1. ... -1.  0.  8.]
 [ 2. 15. 28. ... -1.  0. 67.]
 ...
 [ 4. 15.  7. ... -1.  0. -1.]
 [ 4. 15. 12. ... -1.  0. -1.]
 [ 4.  9. 18. ... -1.  0. 27.]]


In [7]:
print(np.shape(x_train))

(39543, 23)


In [8]:
y_train

array([ 6.,  0.,  2., ...,  0., 94.,  3.])

In [9]:
# # Set missing values to column (feature) medians
# for j in range(len(x_train[0])):
#     col_median_train = np.median(x_train[:,j])
#     col_median_validation = np.median(x_validation[:,j])
#     for i in range(len(x_train)):
#         if x_train[i,j] == -1:
#             x_train[i,j] = col_median_train
#     for i in range(len(x_validation)):
#         if x_validation[i,j] == -1:
#             x_validation[i,j] = col_median_validation

Normalize the Data

In [10]:
# Normalizing the data.
train_mean_array = np.zeros(len(x_train[0]))
train_std_array = np.zeros(len(x_train[0]))
std_nonzero_indices = []
for j in range(len(x_train[0])):
    train_mean_array[j] = np.mean(x_train[:,j])
    train_std_array[j] = np.std(x_train[:,j])
    if train_std_array[j] != 0:
        std_nonzero_indices.append(j)
        x_train[:,j] = \
            np.divide(x_train[:,j] - train_mean_array[j],
                      train_std_array[j])
    if np.std(x_validation[:,j]) != 0:
        x_validation[:,j] = \
            np.divide(x_validation[:,j] - np.mean(x_validation[:,j]),
                      np.std(x_validation[:,j]))

In [11]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
print(std_nonzero_indices)
x_train = x_train[:, std_nonzero_indices]
x_validation = x_validation[:, std_nonzero_indices]

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [12]:
# good_features = [1, 2, 3, 4, 5, 9, 10, 14, 15, 19, 21, 22]
# x_train = x_train[:, good_features]
# x_validation = x_validation[:, good_features]

In [13]:
print(x_train)

[[ 1.01273156  1.03439973 -1.16494235 ... -0.0217789  -0.21822131
  -0.45997069]
 [-0.20076028 -1.21802424 -1.55858225 ... -0.0217789  -0.21822131
  -0.60767234]
 [-0.20076028  1.03439973 -0.04025693 ... -0.0217789  -0.21822131
   2.29712674]
 ...
 [ 1.01273156  1.03439973 -1.22117662 ... -0.0217789  -0.21822131
  -0.45997069]
 [ 1.01273156  1.03439973 -0.94000527 ... -0.0217789  -0.21822131
  -0.45997069]
 [ 1.01273156 -1.21802424 -0.60259964 ... -0.0217789  -0.21822131
   0.32777143]]


In [14]:
print(np.shape(x_train))

(39543, 22)


In [15]:
print(y_train)

[ 6.  0.  2. ...  0. 94.  3.]


In [16]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalidation = xgb.DMatrix(x_validation, label=y_validation)
#dtest = xgb.DMatrix(x_test)
# specify parameters via map
param = {'max_depth':3, 'eta':0.5, 'silent':1,
         'objective':'reg:linear', 'eval_metric':'rmse',
         'scale_pos_rate':1, 'n_estimators':1000}
num_round = 3

In [17]:
# bst = None
# for num_r in range(2, 21):
#     #print(num_r)
#     #print(xgb.cv(param, dtrain, num_r, 5))
    
#     #print(xgb.cv(param, dtrain, num_round, 5))
#     print()
#     print(num_r)
#     bst = xgb.train(param, dtrain, num_boost_round=num_r)
#     # make prediction
    
#     preds = bst.predict(dtrain)
#     print(mean_squared_error(y_train, preds))
    
#     preds = bst.predict(dvalidation)
#     print(mean_squared_error(y_validation, preds))
    
#     #preds = bst.predict(dtest)

In [18]:
# print(bst)

# XGBoost Initialization and Fitting

In [19]:
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb1 = XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=300,
                    silent=True, objective='reg:linear',
                    booster='dart', n_jobs=6, nthread=None,
                    gamma=1, min_child_weight=4, max_delta_step=0,
                    subsample=0.8, colsample_bytree=0.8,
                    colsample_bylevel=1, reg_alpha=0.20, 
                    reg_lambda=1, scale_pos_weight=1, 
                    base_score=0.5, random_state=0,
                    seed=10, missing=None, importance_type='gain')
xgb1 = XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=300,
                    silent=True, objective='reg:linear',
                    booster='gbtree', n_jobs=4, nthread=None,
                    gamma=0, min_child_weight=6, max_delta_step=0,
                    subsample=0.8, colsample_bytree=0.8,
                    colsample_bylevel=1, reg_alpha=0.20, 
                    reg_lambda=1, scale_pos_weight=1, 
                    base_score=0.5, random_state=0, 
                    seed=None, missing=None, importance_type='gain')
xgb1.fit(x_train, y_train, eval_metric='rmse')

# param_test1 = {
#  'max_depth':range(4,7),
#  'min_child_weight':range(4,7)
# }  # result: max_depth = 5, max_child_weight = 6 (old)
# param_test2 = {
#  'n_estimators':range(150, 310, 25)
# } # result: n_estimators = 200 (old)
# param_test3 = {
#     'max_depth':range(4,7),
#     'min_child_weight':range(4,7),
#     'n_estimators':range(150, 310, 25)
# } # result: max_depth = 6, max_child_weight = 4, n_estimators = 300
# param_test4 = {
#     "objective":["reg:linear"],
#     'learning_rate':[0.1, 0.01],
#     'booster':["gbtree", "gblinear", "dart"],
#     "gamma":[0, 1, 0.1, 0.01],
#     "max_delta_step":[0, 1]
# } # result: {'booster': 'dart', 'gamma': 1, 'learning_rate': 0.1, 
# # 'max_delta_step': 0, 'objective': 'reg:linear'}
# param_test5= {
#     "lambda":[0, 1, 10, 20, 100],
#     "alpha":[0, 1, 10, 20, 100],
#     "tree_method":["auto"]
# }
# param_test6= {
#     "subsample":[0.6, 0.8, 1],
#     "colsample_bytree":[0.6, 0.8, 1],
#     "colsample_bylevel":[0.6, 0.8, 1],
#     "colsample_bynode":[0.6, 0.8, 1]
# }
# gsearch1 = GridSearchCV(
#     estimator = XGBRegressor( learning_rate =0.1, n_estimators=300,
#                              max_depth=6, min_child_weight=4, gamma=1,
#                              subsample=0.8, colsample_bytree=0.8,
#                              objective= 'reg:linear', nthread=4,
#                              booster="dart", max_delta_step=0,
#                              scale_pos_weight=1, seed=27),
#     param_grid = param_test2, scoring='neg_mean_squared_error',
#     n_jobs=4,iid=False, cv=3, verbose=999)
# gsearch1.fit(x_train, y_train)
# print(gsearch1.best_params_, gsearch1.best_score_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
Memmaping (shape=(39543, 22), dtype=float64) to new file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_33276_1890688592864\33276-1890623909728-67d6a6e5dc33169ea1fa36f5f81eae52.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_33276_1890688592864\33276-1890623909728-67d6a6e5dc33169ea1fa36f5f81eae52.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).
Pickling array (shape=(13181,), dtype=int32).
Memmaping (shape=(39543, 22), dtype=float64) to old file C:\Users\Phil\AppData\Local\Temp\joblib_memmaping_pool_33276_1890688592864\33276-1890623909728-67d6a6e5dc33169ea1fa36f5f81eae52.pkl
Pickling array (shape=(39543,), dtype=float64).
Pickling array (shape=(26362,), dtype=int32).

[Parallel(n_jobs=4)]: Done  21 out of  21 | elapsed:  6.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  21 out of  21 | elapsed:  6.9min finished
{'n_estimators': 300} -1044.9487559310612


In [20]:
# for key in gsearch1.cv_results_:
#     print(key, gsearch1.cv_results_[key])

mean_fit_time [ 29.35117141  42.54057837  56.82270495  73.99692098  86.77332314
 105.45763954 106.90444795]
std_fit_time [ 3.7598946   1.58781814  2.54549949  1.91874789  0.56474188  0.99593386
 13.0936564 ]
mean_score_time [0.12134266 0.13397519 0.14461295 0.122672   0.11170117 0.14162143
 0.16555738]
std_score_time [0.02774981 0.02746279 0.02750698 0.02411489 0.01340439 0.01999621
 0.05148247]
param_n_estimators [150 175 200 225 250 275 300]
params [{'n_estimators': 150}, {'n_estimators': 175}, {'n_estimators': 200}, {'n_estimators': 225}, {'n_estimators': 250}, {'n_estimators': 275}, {'n_estimators': 300}]
split0_test_score [-1078.92770956 -1062.08743978 -1053.14916671 -1045.0364855
 -1039.05719964 -1035.36826096 -1030.52384493]
split1_test_score [-1093.80299333 -1076.84385662 -1063.87563929 -1054.41258589
 -1043.43292015 -1039.66784203 -1036.69652592]
split2_test_score [-1110.12946633 -1098.86526037 -1091.04196942 -1081.3741436
 -1080.00148588 -1073.55317379 -1067.62589695]
mean_te



In [21]:
y_output_train = xgb1.predict(x_train)

y_output_validation = xgb1.predict(x_validation)

In [22]:
# y_output_train = gsearch1.predict(x_train)

# y_output_validation = gsearch1.predict(x_validation)

In [23]:
y_output_train

array([ 6.5592256,  1.0467567, 12.496606 , ..., 10.44915  , 61.841503 ,
       -1.5584807], dtype=float32)

In [24]:
# Printing the accuracy of the model.
# train_score = rf.score(x_train, y_train)
# print('Train score:', train_score)
# train_auc = roc_auc_score(y_train, y_output_train)
# print('Train auc:', train_auc)
train_mse = mean_squared_error(y_train, y_output_train)
print('Train mse:', train_mse)

Train mse: 822.2288875386835


In [25]:
y_train

array([ 6.,  0.,  2., ...,  0., 94.,  3.])

In [26]:
y_output_lines = []
for i in range(len(y_output_train)):
    y_output_lines.append([i, y_output_train[i], y_train[i]])
np.savetxt("train_output.csv", y_output_lines, fmt='%d,%f,%f')

## Validation Results

In [27]:
## Printing the accuracy of the model, according to the loss function specified in model.compile above.
# validation_score = rf.score(x_validation, y_validation)
# print('Validation score:', validation_score)
# y_output_validation = rf.predict(x_validation)
# validation_auc = roc_auc_score(y_validation, y_output_validation)
# print('Validation auc:', validation_auc)
validation_mse = mean_squared_error(y_validation, y_output_validation)
print('Validation mse:', validation_mse)

Validation mse: 1811.3958576982693


In [28]:
y_output_lines = []
for i in range(len(y_output_validation)):
    y_output_lines.append([i, y_output_validation[i], y_validation[i]])
# np.savetxt("2008_validation_output.csv", y_output_lines, fmt='%d,%f,%f')
np.savetxt("validation_output.csv", y_output_lines, fmt='%d,%f,%f')

In [29]:
# np.mean(cross_val_score(rf, data, label, cv=2, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=2,
#                         scoring="neg_mean_squared_error"))

In [30]:
# np.mean(cross_val_score(rf, data, label, cv=3, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=3,
#                         scoring="neg_mean_squared_error"))

In [31]:
# np.mean(cross_val_score(rf, data, label, cv=4, scoring="roc_auc"))
# np.mean(cross_val_score(rf, data, label, cv=4,
#                         scoring="neg_mean_squared_error"))

# Test Output

In [32]:
# Load the 2008 test data.
X_test = load_data("X_test.csv")
ids = np.arange(len(X_test))
x_test = X_test[:, 3:]

In [33]:
# Set missing values to column (feature) medians
for j in range(len(x_test[0])):
    col_median_test = np.median(x_test[:,j])
    for i in range(len(x_test)):
        if x_test[i,j] == -1:
            x_test[i,j] = col_median_test

In [34]:
# Normalizing the data.
for j in range(len(x_test[0])):
    test_std = np.std(x_test[:,j])
    if test_std != 0:
        x_test[:,j] = \
            np.divide(x_test[:,j] - np.mean(x_test[:,j]),
                      np.std(x_test[:,j]))

In [35]:
print(np.shape(x_test))

(37661, 23)


In [36]:
print(x_test[0])

[ 0.38537524  1.3166926   0.65127962  1.         -0.23222022 -1.15704656
 -0.41129133  2.23731563 -0.26412652  0.10250787  1.69230879 -0.03091436
  0.         -0.22427582  0.92804801 -0.81476327 -0.09169316 -0.23893834
 -0.03754031  1.33763082 -0.02107569  0.70466197 -0.90230796]


In [37]:
print(std_nonzero_indices)

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [38]:
# Remove features from the data that have standard
# deviation of 0 in the training set.
x_test = x_test[:, std_nonzero_indices]

# x_test = x_test[:, good_features]

In [39]:
print(np.shape(x_test))

(37661, 22)


In [40]:
print(x_test[0])

[ 0.38537524  1.3166926   0.65127962 -0.23222022 -1.15704656 -0.41129133
  2.23731563 -0.26412652  0.10250787  1.69230879 -0.03091436  0.
 -0.22427582  0.92804801 -0.81476327 -0.09169316 -0.23893834 -0.03754031
  1.33763082 -0.02107569  0.70466197 -0.90230796]


In [41]:
y_output = xgb1.predict(x_test)

# y_output = gsearch1.predict(x_test)

In [42]:
y_output_lines2 = []
for i in range(len(y_output)):
    y_output_lines2.append([i, y_output[i]])
np.savetxt("submission.csv", y_output_lines2, fmt='%d,%f')

# Test Output 2012

In [43]:
# # Load the 2012 test data.
# X_test2 = load_data("test_2012.csv")
# ids2 = X_test2[:,0]
# x_test2 = X_test2[:, 3:]

In [44]:
# # Normalizing the data.
# for j in range(len(x_test2[0])):
#     test_std = np.std(x_test2[:,j])
#     if test_std != 0:
#         x_test2[:,j] = \
#             np.divide(x_test2[:,j] - np.mean(x_test2[:,j]),
#                       np.std(x_test2[:,j]))

In [45]:
# # Remove features from the data that have standard
# # deviation of 0 in the training set.
# x_test2 = x_test2[:, std_nonzero_indices]

In [46]:
# y_output2 = rf.predict(x_test2)

In [47]:
# y_output_lines3 = []
# for i in range(len(y_output2)):
#     y_output_lines3.append([i, y_output2[i]])
# np.savetxt("2012_submission.csv", y_output_lines3, fmt='%d,%f')