In [1]:
# import standard libraries
import numpy as np
import os as os
import pandas as pd
import time

In [2]:
# import modelling libraries
from sklearn import ensemble, model_selection
import numerox as nx

In [3]:
# move the working directory to the sibling "data" folder (one level up from
# the current directory); later relative paths are resolved from there
os.chdir(os.path.join(os.getcwd(), os.pardir, "data"))

In [4]:
# load the numerai dataset from an existing zip archive, resolved relative to
# the current working directory
# (to fetch the latest dataset instead: data = nx.download("numerai_dataset.zip"))
dataset_archive = "numerai_dataset.zip"
data = nx.load_zip(dataset_archive)

In [5]:
# environment settings: model name (used in the submission file names and the
# output path) and the folder that holds the submissions
MODEL_NAME, FOLDER_NAME = "random-forest", "submission"

In [6]:
# numerox model that wraps scikit-learn's random forest classifier
class randomforest(nx.Model):
    """Random forest classifier exposed through the numerox ``Model`` base class.

    params: mapping holding the keys 'n_estimators', 'criterion',
        'max_features', 'max_depth', 'min_samples_split' and
        'min_samples_leaf'; stored as ``self.p``.
    seed: value passed to the classifier as ``random_state`` (default 0).
    """

    def __init__(self, params, seed=0):
        self.seed = seed
        self.p = params

    def fit_predict(self, dfit, dpre, tournament):
        """Fit a fresh forest on ``dfit`` and score the rows of ``dpre``.

        The classifier is trained on ``dfit.x`` against ``dfit.y[tournament]``.
        Returns the tuple ``(dpre.ids, yhat)`` where ``yhat`` is the second
        column of ``predict_proba(dpre.x)``.
        Raises KeyError if ``self.p`` lacks one of the forwarded keys.
        """
        # only these entries of the parameter mapping are forwarded; 'bootstrap'
        # is deliberately not passed, so the estimator default applies
        forwarded = ('n_estimators', 'criterion', 'max_features',
                     'max_depth', 'min_samples_split', 'min_samples_leaf')
        settings = dict((key, self.p[key]) for key in forwarded)

        forest = ensemble.RandomForestClassifier(random_state=self.seed,
                                                 n_jobs=-1,
                                                 **settings)
        forest.fit(dfit.x, dfit.y[tournament])

        probabilities = forest.predict_proba(dpre.x)
        return dpre.ids, probabilities[:, 1]

In [13]:
# candidate values for each hyper-parameter explored while tuning the model
# forest size and split criterion
n_estimators, criterion = [100, 200, 400], ["gini", "entropy"]
# features considered per split and maximum tree depth
max_features, max_depth = ["sqrt", "log2"], [5, 10, 20]
# minimum sample counts required to split a node and to form a leaf
min_samples_split, min_samples_leaf = [5, 10], [1, 2, 4]

In [14]:
# parameter grid: maps each hyper-parameter name to its list of candidate values
parameters = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [9]:
# read the raw training csv (first row is the header) from the extracted
# dataset folder under the current working directory
training_csv = os.path.join(os.getcwd(), "numerai_dataset", "numerai_training_data.csv")
train_data = pd.read_csv(training_csv, header=0)

# feature matrix: the label-based column slice "feature1" .. "feature50" as an array
feature_columns = train_data.loc[:, "feature1":"feature50"]
X_train = np.array(feature_columns)

In [10]:
# tournaments to model; only "bernie" is enabled, the other names are kept
# commented out for reference
tournaments = [
    "bernie",
    # "elizabeth", "jordan", "ken", "charles", "frank", "hillary"
]

In [11]:
# switch the working directory to ../modelling/<FOLDER_NAME>/<MODEL_NAME>
# (relative to the current directory) so submissions are written there
submission_dir = os.path.join(os.getcwd(), os.pardir, "modelling", FOLDER_NAME, MODEL_NAME)
os.chdir(submission_dir)

In [12]:
# for each tournament: tune the random forest with a grid search, backtest the
# tuned model, then build and save the submission file
# NOTE: the single-argument print(...) form is used throughout; it emits the
# same text under Python 2 as the former print statements
SEED = 123  # shared by the grid-search estimator and the final model so reruns are repeatable
for tournament in tournaments:
    # initialize tournament modelling timer
    start = time.time()

    # set the target name for the tournament
    target = "target_" + tournament

    # select the target column by name: a missing column raises KeyError here
    # instead of silently producing an empty target array
    y_train = train_data[target].values

    print("finding best params for:  {0}".format(tournament))
    # seed the base estimator, otherwise the selected parameters can differ
    # from one run to the next
    clf = model_selection.GridSearchCV(ensemble.RandomForestClassifier(random_state=SEED),
                                       parameters,
                                       scoring="neg_log_loss",
                                       cv=3,
                                       n_jobs=-1,
                                       verbose=2)
    clf.fit(X_train, y_train)
    best_params = clf.best_params_
    print("best params:  {0}".format(best_params))

    # create a new random forest model for the tournament
    model = randomforest(best_params, seed=SEED)

    print("training info for:  {0}".format(tournament))
    train = nx.backtest(model, data, tournament, verbosity=1)

    print("validation info for:  {0}".format(tournament))
    validation = nx.production(model, data, tournament, verbosity=1)

    print("saving validation info submission for:  {0}".format(tournament))
    validation.to_csv(MODEL_NAME + "-" + tournament + ".csv")
    print("done saving validation info")

    # end tournament modelling timer
    stop = time.time()

    # time.time() reports seconds, so dividing by 60 gives minutes; the former
    # /(1000*60) % 60 treated the value as milliseconds and wrapped at 60
    elapsed_minutes = (stop - start) / 60.0
    print("model duration (minutes):  {0}".format(elapsed_minutes))

    print("\n")

finding best params for:  bernie
Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 254.6min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 998.9min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 2536.3min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 2657.8min finished


best params:  {'min_samples_leaf': 4, 'n_estimators': 200, 'min_samples_split': 10, 'criterion': 'gini', 'max_features': 'log2', 'max_depth': 5}
training info for:  bernie
randomforest(min_samples_leaf=4, n_estimators=200, min_samples_split=10, criterion=gini, max_features=log2, max_depth=5)
       logloss     auc     acc    ystd   stats        
mean  0.692653  0.5181  0.5124  0.0121   tourn  bernie
std   0.001023  0.0244  0.0185  0.0004  region   train
min   0.689346  0.4573  0.4669  0.0110    eras     120
max   0.695414  0.5960  0.5678  0.0130  consis   0.625
validation info for:  bernie
randomforest(min_samples_leaf=4, n_estimators=200, min_samples_split=10, criterion=gini, max_features=log2, max_depth=5)
       logloss     auc     acc    ystd   stats            
mean  0.692564  0.5205  0.5144  0.0116   tourn      bernie
std   0.000721  0.0168  0.0127  0.0001  region  validation
min   0.691389  0.4892  0.4882  0.0114    eras          12
max   0.693949  0.5472  0.5346  0.0118  consis