In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime

In [2]:
def loadTrainAndTestDatasets():
        train_df = pd.read_csv("data/numerai_training_data.csv")
        test_df = pd.read_csv("data/numerai_tournament_data.csv")
        
        return train_df, test_df

In [3]:
def prepareDatasets(train_df, test_df):
    
    print('-- Transformation step has begun --- ')
    
    train_outcome = train_df["target"]
    
    train = train_df
    train.drop(["target"], axis=1, inplace=True)
    
    test = test_df
    test.drop(["t_id"], axis=1, inplace=True)
    
    print('-- Transformation step has finished --- ')
    
    return train_outcome, train, test

## Cross-validation

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss

def calculateCVMetrics(train, train_outcome, model):
    
    X_train, X_val, y_train, y_val = train_test_split(train, train_outcome, test_size=0.3)
    model.fit(X_train, y_train)
    clf_probs = model.predict_proba(X_val)
    print('Log Loss metric')
    print(log_loss(y_val, clf_probs))

## Create predictions and submission file

In [5]:
def makePredictions(model, test):
    return model.predict_proba(test)

In [6]:
def createCSVSubmissionFile(predictions, fileName):
    
    results = pd.read_csv("submissions/example_predictions.csv")
    
    results['probability'] = predictions[:,1]
    results.to_csv("submissions/" + fileName, index=False)

# RUN ALL STEPS

In [7]:
train_df, test_df = loadTrainAndTestDatasets()

In [8]:
train_outcome, train, test = prepareDatasets(train_df, test_df)

-- Transformation step has begun --- 
-- Transformation step has finished --- 


In [31]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
modelXGB = XGBClassifier(n_estimators=5000, learning_rate = 0.001, max_depth=6, subsample=0.7, 
        #colsample_bytree = 0.7, # gamma = 0.7, # max_delta_step=0.1, 
        #reg_lambda = 4, # min_child_weight=50, 
        #seed = seed, 
                        ) 
#modelXGB = XGBClassifier() 
    
modelXGB.fit(train, train_outcome, eval_metric='mlogloss',)
calculateCVMetrics(train, train_outcome, modelXGB)

Log Loss metric
0.692719283598


In [32]:
predictionsXGBoost = makePredictions(modelXGB, test)

In [33]:
predictionsXGBoost[:10]

array([[ 0.46221948,  0.53778052],
       [ 0.44475436,  0.55524564],
       [ 0.42980969,  0.57019031],
       [ 0.50341177,  0.4965882 ],
       [ 0.42915457,  0.57084543],
       [ 0.46705002,  0.53294998],
       [ 0.44288737,  0.55711263],
       [ 0.474774  ,  0.525226  ],
       [ 0.56255424,  0.43744573],
       [ 0.52497959,  0.47502044]], dtype=float32)

In [34]:
from datetime import datetime
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_" + now + ".csv"
createCSVSubmissionFile(predictionsXGBoost, fileName)

In [None]:
train.head()

In [None]:
t = pd.read_csv("data/numerai_tournament_data.csv")

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, SGDClassifier
classifiers = [
    ##KNeighborsClassifier(100), #0.695473316697
    ##SVC(max_iter=1000, probability=True, kernel='rbf', degree=20), #0.693252664625
    ##DecisionTreeClassifier(max_depth=3),   #0.692021496037
    RandomForestClassifier(max_depth=5, n_estimators=100), #0.69200677508
    ##AdaBoostClassifier(), #0.693078393931
    ##GaussianNB(), #0.700521295321
    ##QuadraticDiscriminantAnalysis(), #0.695140853024
    ##LogisticRegression(),  #0.691223229661
    #LogisticRegression()
    ##GradientBoostingClassifier() #0.691897373977
    ##GradientBoostingClassifier(learning_rate=0.05, min_samples_split=50, max_depth=8) #0.694131149586
    # GradientBoostingClassifier(learning_rate=0.005, n_estimators=3000, min_samples_split=600, min_samples_leaf=30, max_depth=12, subsample=0.85)
]

In [40]:
for classifier in classifiers:
    
    model = classifier
    
    model.fit(train, train_outcome)
    calculateCVMetrics(train, train_outcome, model)

Log Loss metric
0.69169577635


In [46]:
from sklearn.grid_search import GridSearchCV
rfc = RandomForestClassifier(n_jobs=4)
param_grid = { 
    'n_estimators': [100, 500, 1000],
    'max_depth' : [3, 5, 8]
}
model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10, n_jobs=4)

In [None]:
model.fit(train, train_outcome)
calculateCVMetrics(train, train_outcome, model)


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    3.2s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    3.6s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    3.6s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.3s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.3s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.5s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.6s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.0s
[Parallel(n_jo

In [45]:
predictions = makePredictions(model, test)

from datetime import datetime
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_" + now + ".csv"
createCSVSubmissionFile(predictions, fileName)

In [None]:
model.best_estimator_

### TESTING FEATURE SELECTION

In [44]:
model.feature_importances_

array([ 0.0339017 ,  0.05916888,  0.02903521,  0.03310291,  0.03963424,
        0.03137447,  0.0303895 ,  0.02958126,  0.02954537,  0.05504999,
        0.06498488,  0.03716954,  0.04618844,  0.06361075,  0.08634508,
        0.08078569,  0.07468124,  0.032105  ,  0.0275825 ,  0.05248419,
        0.06327914])

In [None]:
from sklearn.feature_selection import SelectFromModel
new_model = SelectFromModel(model, prefit=True)
X_new = new_model.transform(train)
new_model.fit(X_new, train_outcome)
calculateCVMetrics(X_new, train_outcome, new_model)


In [None]:
Y_new = new_model.transform(test)
predictions = makePredictions(new_model, Y_new)

from datetime import datetime
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_" + now + ".csv"
createCSVSubmissionFile(predictions, fileName)