In [1]:
# import standard libraries
import numpy as np
import os as os
import pandas as pd
import time

In [2]:
# import modelling libraries
from sklearn import linear_model, model_selection
import numerox as nx

In [3]:
# set the data working directory
os.chdir(os.path.join(os.getcwd(), "..", "data"))

In [4]:
# download the latest numerai dataset
# data = nx.download("numerai_dataset.zip")

# to make it faster use an existing dataset
data = nx.load_zip("numerai_dataset.zip")

In [5]:
# environment settings
MODEL_NAME = "logistic-regression"
FOLDER_NAME = "submission"

In [6]:
# extend the logistic model class offered by numerox
class logistic(nx.Model):

    def __init__(self, params):
        self.p = params

    def fit_predict(self, dfit, dpre, tournament):
        model = linear_model.LogisticRegression(C=self.p['C'], 
                                                solver=self.p['solver'], 
                                                multi_class=self.p['multi_class'])
        model.fit(dfit.x, dfit.y[tournament])
        yhat = model.predict_proba(dpre.x)[:, 1]
        return dpre.ids, yhat

In [7]:
# parameters required for hyper-tuning the model
C = [0.0001, 0.001, 0.01]
solver = ["newton-cg", "lbfgs", "sag", "saga"]
multi_class = ["ovr", "multinomial", "auto"]

In [8]:
# combination of parameters
parameters = {'C': C,
             'solver': solver,
             'multi_class': multi_class}

In [14]:
# use grid search cv to find the best parameters
train_data = pd.read_csv(os.path.join(os.getcwd(), "numerai_dataset", "numerai_training_data.csv"), header=0)
X = np.array(train_data.loc[:, :"feature50"])
X.shape
y_train = train_data.iloc[:, train_data.columns == "target_bernie"])
y_train

NameError: name 'array' is not defined

In [10]:
# list of tournaments
tournaments = ["bernie", "elizabeth", "jordan", "ken", "charles", "frank", "hillary"]

In [11]:
# set the directory to save the submissions
os.chdir(os.path.join(os.getcwd(), "..", "modelling", FOLDER_NAME, MODEL_NAME))

In [12]:
# define kfold cross validation split
kfold_split = 5

# loop through each tournament and print the input for train and validation
for index in range(0, len(tournaments)):
    # get the tournament name
    tournament = tournaments[index]
    
    print "*********** TOURNAMENT " + tournament + " ***********"
    
    # set the target name for the tournament
    target = "target_" + tournament
    
    # set the y train with the target variable
    y = y_train = train_data.iloc[:, train_data.columns == target].values.reshape(-1,)
    
    # use GroupKFold for splitting the era
    group_kfold = model_selection.GroupKFold(n_splits=kfold_split)
    
    counter = 1
    
    print ">> group eras using kfold split\n"
    
    for train_index, test_index in group_kfold.split(X, y, groups=train_data['era']):
        # X_train takes the 50 features only for training and leave the other columns
        X_train = X[train_index][:,3:]
        # y_train remains the same
        y_train = y[train_index]
        
        print ">> running split #", counter
        
        print ">> finding best params"
        clf = model_selection.GridSearchCV(linear_model.LogisticRegression(), parameters, scoring="neg_log_loss", cv=kfold_split, n_jobs=-1)
        clf.fit(X_train, y_train)
        best_params = clf.best_params_
        print ">> best params: ", best_params

        # create a new logistic regression model for the tournament
        model = logistic(best_params)

        print ">> training info:"
        train = nx.backtest(model, data, tournament, verbosity=1)

        print ">> validation info:"
        validation = nx.production(model, data, tournament, verbosity=1)

        print ">> saving validation info: "
        validation.to_csv(MODEL_NAME + "-" + tournament + "-" + str(counter) + ".csv")
        print ">> done saving validation info"

        print "\n"
        
        counter=counter+1
    

*********** TOURNAMENT bernie ***********
>> group eras using kfold split

>> running split # 1
>> finding best params
>> best params:  {'multi_class': 'ovr', 'C': 0.001, 'solver': 'lbfgs'}
>> training info:
logistic(multi_class=ovr, C=0.001, solver=lbfgs)
       logloss     auc     acc    ystd   stats          
mean  0.692583  0.5187  0.5125  0.0170   tourn    bernie
std   0.001457  0.0251  0.0193  0.0006  region     train
min   0.687834  0.4542  0.4699  0.0156    eras       120
max   0.696270  0.5993  0.5712  0.0191  consis  0.608333
>> validation info:
logistic(multi_class=ovr, C=0.001, solver=lbfgs)
       logloss     auc     acc    ystd   stats            
mean  0.692469  0.5200  0.5144  0.0171   tourn      bernie
std   0.001107  0.0184  0.0156  0.0003  region  validation
min   0.690483  0.4912  0.4875  0.0166    eras          12
max   0.694352  0.5521  0.5442  0.0174  consis        0.75
>> saving validation info: 
>> done saving validation info


>> running split # 2
>> finding b



>> best params:  {'multi_class': 'ovr', 'C': 0.001, 'solver': 'lbfgs'}
>> training info:
logistic(multi_class=ovr, C=0.001, solver=lbfgs)
       logloss     auc     acc    ystd   stats          
mean  0.692583  0.5187  0.5125  0.0170   tourn    bernie
std   0.001457  0.0251  0.0193  0.0006  region     train
min   0.687834  0.4542  0.4699  0.0156    eras       120
max   0.696270  0.5993  0.5712  0.0191  consis  0.608333
>> validation info:
logistic(multi_class=ovr, C=0.001, solver=lbfgs)
       logloss     auc     acc    ystd   stats            
mean  0.692469  0.5200  0.5144  0.0171   tourn      bernie
std   0.001107  0.0184  0.0156  0.0003  region  validation
min   0.690483  0.4912  0.4875  0.0166    eras          12
max   0.694352  0.5521  0.5442  0.0174  consis        0.75
>> saving validation info: 
>> done saving validation info


*********** TOURNAMENT elizabeth ***********
>> group eras using kfold split

>> running split # 1
>> finding best params
>> best params:  {'multi_class



>> best params:  {'multi_class': 'ovr', 'C': 0.01, 'solver': 'lbfgs'}
>> training info:
logistic(multi_class=ovr, C=0.01, solver=lbfgs)
       logloss     auc     acc    ystd   stats          
mean  0.692557  0.5198  0.5134  0.0218   tourn    jordan
std   0.001972  0.0262  0.0194  0.0009  region     train
min   0.688777  0.4300  0.4422  0.0196    eras       120
max   0.699268  0.5711  0.5534  0.0243  consis  0.583333
>> validation info:
logistic(multi_class=ovr, C=0.01, solver=lbfgs)
       logloss     auc     acc    ystd   stats            
mean  0.692648  0.5179  0.5134  0.0212   tourn      jordan
std   0.001264  0.0176  0.0143  0.0004  region  validation
min   0.690445  0.4934  0.4949  0.0205    eras          12
max   0.694627  0.5468  0.5370  0.0216  consis    0.666667
>> saving validation info: 
>> done saving validation info


>> running split # 4
>> finding best params
>> best params:  {'multi_class': 'ovr', 'C': 0.01, 'solver': 'lbfgs'}
>> training info:
logistic(multi_class=ov

       logloss     auc     acc    ystd   stats         
mean  0.692821  0.5144  0.5101  0.0139   tourn  charles
std   0.001061  0.0218  0.0171  0.0009  region    train
min   0.690413  0.4597  0.4660  0.0122    eras      120
max   0.695606  0.5632  0.5509  0.0154  consis    0.625
>> validation info:
logistic(multi_class=ovr, C=0.001, solver=lbfgs)
       logloss     auc     acc    ystd   stats            
mean  0.692372  0.5233  0.5165  0.0138   tourn     charles
std   0.000899  0.0186  0.0138  0.0003  region  validation
min   0.690979  0.4775  0.4858  0.0133    eras          12
max   0.694656  0.5538  0.5417  0.0142  consis    0.833333
>> saving validation info: 
>> done saving validation info


>> running split # 3
>> finding best params
>> best params:  {'multi_class': 'multinomial', 'C': 0.001, 'solver': 'lbfgs'}
>> training info:
logistic(multi_class=multinomial, C=0.001, solver=lbfgs)
       logloss     auc     acc    ystd   stats          
mean  0.692830  0.5144  0.5099  0.0156  

       logloss     auc     acc    ystd   stats         
mean  0.692456  0.5214  0.5157  0.0182   tourn  hillary
std   0.001230  0.0195  0.0149  0.0010  region    train
min   0.689836  0.4583  0.4629  0.0161    eras      120
max   0.696685  0.5635  0.5486  0.0216  consis    0.675
>> validation info:
logistic(multi_class=auto, C=0.01, solver=saga)
       logloss     auc     acc    ystd   stats            
mean  0.692564  0.5192  0.5145  0.0178   tourn     hillary
std   0.000933  0.0153  0.0119  0.0002  region  validation
min   0.691466  0.4860  0.4892  0.0173    eras          12
max   0.694617  0.5397  0.5343  0.0181  consis        0.75
>> saving validation info: 
>> done saving validation info


>> running split # 2
>> finding best params
>> best params:  {'multi_class': 'ovr', 'C': 0.01, 'solver': 'lbfgs'}
>> training info:
logistic(multi_class=ovr, C=0.01, solver=lbfgs)
       logloss     auc     acc    ystd   stats          
mean  0.692458  0.5213  0.5157  0.0182   tourn   hillary
st