In [None]:
import pandas as pd
import numpy as np
from time import time
from data.data import load_data
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
# Validation-set predictions of each base model. Every frame is read for its
# 'prediction' column only.
# TODO: read further prediction files and add them to `datas`.
convnet_val = pd.read_csv('./predictions/validation_convnet_augmentation.csv')
recnn_val = pd.read_csv('./predictions/validation_recnn_no_augmentation.csv')
datas = [convnet_val, recnn_val]

# Test-set predictions, listed in the same model order as `datas` so the
# feature columns of the train and test matrices line up.
# TODO: add further test prediction files to `tests`.
convnet_test = pd.read_csv('./predictions/test_convnet_augmentation.csv')
recnn_test = pd.read_csv('./predictions/test_recnn_no_augmentation.csv')
tests = [convnet_test, recnn_test]

# Feature matrices for the ensemble: one column per base model.
x_total_train = np.stack([frame['prediction'] for frame in datas], axis=1)
x_test = np.stack([frame['prediction'] for frame in tests], axis=1)

# Regression targets, flattened to 1-D.
# NOTE(review): assumes the middle tuple returned by load_data() holds the
# validation split and that its rows are in the same order as the validation
# prediction files -- confirm against data.data.load_data.
_, (_, y_pred_val, _), _ = load_data()
y_total_train = np.ravel(y_pred_val)

# Hold out part of the stacked data to score the ensemble. The split is seeded
# (same seed as the XGBoost models) so that the grid search result and the
# reported RMSE are reproducible across Restart & Run All.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_total_train, y_total_train, random_state=1234)

In [None]:
# Hyper-parameter search: exhaustive 5-fold grid over tree depth, child weight,
# learning rate and gamma, using a 100-tree model to keep each fit cheap.
param_dict = {
    'max_depth': list(range(1, 6)),
    'min_child_weight': list(range(2, 8)),
    'learning_rate': [0.05, 0.1, 0.15],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
}
xg_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    seed=1234,
)

start = time()
grid_search = GridSearchCV(estimator=xg_model, param_grid=param_dict, cv=5, verbose=1)
grid_search.fit(x_train, y_train)
print("GridSearch took %.2f seconds to complete." % (time()-start))

# Keep the winning combination around for the later cells and show it.
best = grid_search.best_params_
display(best)

In [None]:
# Validation run: rebuild the regressor with the tuned settings and 1000
# trees, fit it on the training split and score it on the held-out split.
tuned = {
    name: best[name]
    for name in ('learning_rate', 'max_depth', 'min_child_weight', 'gamma')
}
xg_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    seed=1234,
    **tuned,
)

# The timer covers both fitting and predicting.
start = time()
xg_model.fit(x_train, y_train)
xg_preds = xg_model.predict(x_valid)
print("Model took %.2f seconds to complete." % (time()-start))

rmse = np.sqrt(mean_squared_error(y_valid, xg_preds))
print("RMSE: %.4f" % rmse)

In [None]:
# Final model: same tuned configuration as the validation run, but fitted on
# every stacked row (training and held-out splits together).
final_params = dict(objective='reg:squarederror', n_estimators=1000, seed=1234)
final_params.update(
    (key, best[key])
    for key in ('learning_rate', 'max_depth', 'min_child_weight', 'gamma')
)
xg_model_total = XGBRegressor(**final_params)
xg_model_total.fit(x_total_train, y_total_train)

In [None]:
# Ensemble predictions for the test set from the model fitted on all rows.
raw_test_preds = xg_model_total.predict(x_test)
# expm1 is the inverse of log1p.
# NOTE(review): presumably the targets from load_data() are log1p-transformed
# and this maps the output back to the original scale -- confirm.
xg_preds = np.expm1(raw_test_preds)

In [None]:
# Write the ensemble's test predictions to disk. The previous code exported an
# undefined `data1` frame (NameError on a fresh kernel) and never used
# `xg_preds`.
# NOTE(review): the output reuses the layout of the convnet test prediction
# file with its 'prediction' column replaced by the ensemble output -- confirm
# this is the expected file format.
submission = convnet_test.copy()
submission['prediction'] = xg_preds
submission.to_csv('./predictions/predictions_xgboost_ensemble.csv', index=False)