In [20]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time

import datetime

print(datetime.datetime.now())

2017-07-23 18:46:34.530183


In [7]:
val_data = pd.read_csv('../data/numerai_tournament_data.csv')
data = pd.read_csv('../data/numerai_training_data.csv')

In [51]:
X = data.iloc[:, 3:-1]
y = data.iloc[:, -1]
era = data.era.values.flatten()
X_val = val_data.iloc[:, 3:-1]

In [35]:
clf = RandomForestClassifier(n_estimators=3, n_jobs=6)

In [36]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [46]:
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [47]:
n_folds = 2
skf = StratifiedKFold(n_splits=n_folds, shuffle=True,)
n_models = 1

predictions = []
for j in range(n_models):
    for i, (trainIndx, testIndx) in enumerate(skf.split(X, era)):
        n_iter_search = 1
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

        start = time()
        random_search.fit(X.iloc[trainIndx, :], y.iloc[trainIndx])
        report(random_search.cv_results_)
        predictions.append(random_search.predict_proba(X_val)[:, 1])

Model with rank: 1
Mean validation score: 0.504 (std: 0.000)
Parameters: {'min_samples_split': 6, 'criterion': 'gini', 'max_features': 8, 'max_depth': None, 'bootstrap': True, 'min_samples_leaf': 9}

Model with rank: 1
Mean validation score: 0.500 (std: 0.004)
Parameters: {'min_samples_split': 8, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None, 'bootstrap': False, 'min_samples_leaf': 10}



In [48]:
predictions

[array([ 0.66170658,  0.6984127 ,  0.42222222, ...,  0.50930736,
         0.78479532,  0.31309524]),
 array([ 0.26509288,  0.73083779,  0.14219114, ...,  0.63333333,
         0.23269537,  0.70964912])]

In [49]:
X_val.shape

(108405, 21)