In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime

In [None]:
def loadTrainAndTestDatasets(trainPath="data/numerai_training_data.csv",
                             testPath="data/numerai_tournament_data.csv"):
    """Read the training and tournament CSV files into DataFrames.

    Parameters
    ----------
    trainPath : str
        Location of the training CSV. Defaults to the path the notebook
        has always used.
    testPath : str
        Location of the tournament CSV. Defaults to the path the notebook
        has always used.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` exactly as parsed by ``pd.read_csv``.
    """
    train_df = pd.read_csv(trainPath)
    test_df = pd.read_csv(testPath)

    return train_df, test_df

In [None]:
def prepareDatasets(train_df, test_df):
    """Split the raw frames into the pieces the models consume.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain a ``"target"`` column.
    test_df : pandas.DataFrame
        Tournament data; must contain a ``"t_id"`` column.

    Returns
    -------
    tuple
        ``(train_outcome, train, test)`` where ``train_outcome`` is the
        ``"target"`` column, ``train`` is ``train_df`` without ``"target"``
        and ``test`` is ``test_df`` without ``"t_id"``.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    print('-- Transformation step has begun --- ')

    train_outcome = train_df["target"]

    # drop() without inplace=True returns a new frame, so the caller's
    # DataFrames stay intact and this step can be re-run on the same inputs
    # (the in-place version failed with KeyError on a second run).
    train = train_df.drop(["target"], axis=1)
    test = test_df.drop(["t_id"], axis=1)

    print('-- Transformation step has finished --- ')

    return train_outcome, train, test

## Cross-validation

In [None]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss

def calculateCVMetrics(train, train_outcome, model, test_size=0.3, random_state=None):
    """Print and return the hold-out log loss of ``model``.

    The data is split once into a training part and a validation part, an
    unfitted copy of ``model`` is trained on the training part, and the log
    loss of its ``predict_proba`` output on the validation part is reported.

    Parameters
    ----------
    train : array-like or DataFrame
        Feature matrix.
    train_outcome : array-like or Series
        Labels aligned with ``train``.
    model : estimator
        Scikit-learn compatible classifier exposing ``predict_proba``.
        It is cloned, so the object passed in is left exactly as it was
        (in particular, a model already fitted on the full data is not
        overwritten by a fit on the smaller training split).
    test_size : float, optional
        Fraction of rows held out for validation (default ``0.3``).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``; ``None`` keeps the split
        random on every call, as before.

    Returns
    -------
    float
        The log loss on the validation part.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        train, train_outcome, test_size=test_size, random_state=random_state)

    # Work on a clone so evaluating a model never changes the caller's object.
    holdout_model = clone(model)
    holdout_model.fit(X_train, y_train)
    clf_probs = holdout_model.predict_proba(X_val)

    score = log_loss(y_val, clf_probs)
    print('Log Loss metric')
    print(score)
    return score

## Create predictions and submission file

In [None]:
def makePredictions(model, test):
    """Return ``model.predict_proba(test)``.

    Thin convenience wrapper: ``test`` is forwarded unchanged to the model's
    ``predict_proba`` method and whatever that call produces is handed back.
    """
    probabilities = model.predict_proba(test)
    return probabilities

In [None]:
def createCSVSubmissionFile(predictions, fileName,
                            templatePath="submissions/example_predictions.csv",
                            outputDir="submissions/"):
    """Write a submission CSV built from a template file.

    The template CSV is loaded, its ``probability`` column is replaced by the
    second column of ``predictions``, and the result is written (without the
    index) to ``outputDir``/``fileName``.

    Parameters
    ----------
    predictions : array-like, shape (n_rows, >=2)
        Per-row class probabilities; column 1 is used. Must have as many rows
        as the template file.
    fileName : str
        Name of the CSV file to create inside ``outputDir``.
    templatePath : str, optional
        CSV providing the row layout of the submission. Defaults to the
        example file the notebook has always used.
    outputDir : str, optional
        Directory the submission is written to (default ``"submissions/"``).

    Returns
    -------
    str
        Path of the file that was written.
    """
    import os  # local import keeps this cell self-contained

    results = pd.read_csv(templatePath)

    # np.asarray lets callers pass nested lists as well as ndarrays.
    results['probability'] = np.asarray(predictions)[:, 1]

    outputPath = os.path.join(outputDir, fileName)
    results.to_csv(outputPath, index=False)
    return outputPath

# RUN ALL STEPS

In [None]:
# Read the raw training and tournament CSV files into DataFrames.
train_df, test_df = loadTrainAndTestDatasets()

In [None]:
# Pull the "target" column out of the training frame as the labels and strip
# "target" / "t_id" from the feature frames handed to the models.
train_outcome, train, test = prepareDatasets(train_df, test_df)

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# Parameters that were present but commented out in earlier experiments and
# are therefore left at the library defaults here:
#   colsample_bytree=0.7, gamma=0.7, max_delta_step=0.1,
#   reg_lambda=4, min_child_weight=50, seed=seed
# (a plain XGBClassifier() was also kept around as a commented alternative)
modelXGB = XGBClassifier(n_estimators=5000, learning_rate=0.001, max_depth=6, subsample=0.7)

# Evaluate before the final fit, so that the model used for the submission is
# guaranteed to be the one trained on every row regardless of any fitting done
# during evaluation.
calculateCVMetrics(train, train_outcome, modelXGB)

# Final fit on the full training set.  The former eval_metric='mlogloss'
# argument was dropped: it is the multiclass metric, it only takes effect when
# an eval_set is supplied (none is), and recent xgboost releases no longer
# accept it in fit().
modelXGB.fit(train, train_outcome)

In [None]:
# predict_proba output of the XGBoost model for the prepared tournament features.
predictionsXGBoost = makePredictions(modelXGB, test)

In [None]:
# Show the first ten prediction rows.
predictionsXGBoost[:10]

In [None]:
from datetime import datetime

# Tag the submission with the current local time (YYYYmmdd-HHMMSS) so that
# separate runs end up in separate files.
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_{}.csv".format(now)
createCSVSubmissionFile(predictionsXGBoost, fileName)

In [None]:
# Display the first rows of the training features returned by prepareDatasets.
train.head()

In [None]:
# Re-read the raw tournament CSV into `t`.
# NOTE(review): `t` is not referenced by any other visible cell — possibly a
# leftover from exploration; confirm before removing.
t = pd.read_csv("data/numerai_tournament_data.csv")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, SGDClassifier
# Models to be fitted and scored by the evaluation loop.
# Only the un-commented entry is active.  Lines starting with "##" are
# candidates that have been switched off; the number after each one is
# presumably the log loss it reached with calculateCVMetrics — TODO confirm.
# NOTE(review): several disabled lines lack a trailing comma, so add one when
# re-enabling more than one entry.
classifiers = [
    ##KNeighborsClassifier(100), #0.695473316697
    ##SVC(max_iter=1000, probability=True, kernel='rbf', degree=20), #0.693252664625
    ##DecisionTreeClassifier(max_depth=3),   #0.692021496037
    RandomForestClassifier(max_depth=5, n_estimators=100), #0.69200677508
    ##AdaBoostClassifier(), #0.693078393931
    ##GaussianNB(), #0.700521295321
    ##QuadraticDiscriminantAnalysis(), #0.695140853024
    ##LogisticRegression(),  #0.691223229661
    #LogisticRegression()
    ##GradientBoostingClassifier() #0.691897373977
    ##GradientBoostingClassifier(learning_rate=0.05, min_samples_split=50, max_depth=8) #0.694131149586
    # GradientBoostingClassifier(learning_rate=0.005, n_estimators=3000, min_samples_split=600, min_samples_leaf=30, max_depth=12, subsample=0.85)
]

In [None]:
for classifier in classifiers:

    model = classifier

    # Score on the hold-out split first and fit on the complete training set
    # last, so `model` always ends up trained on every row regardless of any
    # fitting done during evaluation.
    calculateCVMetrics(train, train_outcome, model)
    model.fit(train, train_outcome)

In [None]:
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.grid_search import GridSearchCV

rfc = RandomForestClassifier(n_jobs=4)

# 3 x 3 = 9 parameter combinations, each scored with 10-fold cross-validation.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 8],
}

# NOTE(review): no `scoring` is passed, so the search ranks candidates with the
# estimator's default scorer rather than the log loss printed elsewhere in this
# notebook — confirm that this is intended.
model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10, n_jobs=4)

In [None]:
# Hold-out evaluation first, full-data search last: whatever fitting happens
# during evaluation, the search kept in `model` (and its best_estimator_) is the
# one run on the complete training set.
calculateCVMetrics(train, train_outcome, model)
model.fit(train, train_outcome)


In [None]:
from datetime import datetime

# Probabilities from the grid-searched model for the tournament features.
predictions = makePredictions(model, test)

# Timestamped (YYYYmmdd-HHMMSS) submission file name.
now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_{}.csv".format(now)
createCSVSubmissionFile(predictions, fileName)

In [None]:
# Show the estimator selected by the grid search.
model.best_estimator_

### TESTING FEATURE SELECTION

In [None]:
# `model` is the GridSearchCV wrapper, which does not expose
# feature_importances_ itself; the importances live on the forest it selected.
model.best_estimator_.feature_importances_

In [None]:
from sklearn.base import clone
from sklearn.feature_selection import SelectFromModel

# Build the selector from the fitted forest chosen by the grid search: the
# GridSearchCV wrapper itself carries no feature importances to threshold on.
new_model = SelectFromModel(model.best_estimator_, prefit=True)
X_new = new_model.transform(train)

# The selector only filters columns — it has no predict_proba and, being
# prefit, must not be fitted again on the already-reduced matrix.  Score an
# unfitted copy of the selected forest on the reduced features instead.
calculateCVMetrics(X_new, train_outcome, clone(model.best_estimator_))


In [None]:
from datetime import datetime
from sklearn.base import clone

# Apply the same column selection to the tournament features.
Y_new = new_model.transform(test)

# `new_model` is a feature selector and cannot produce probabilities, so train
# a fresh copy of the grid search's best forest on the reduced training matrix
# and predict with that.
reduced_model = clone(model.best_estimator_)
reduced_model.fit(X_new, train_outcome)
predictions = makePredictions(reduced_model, Y_new)

now = datetime.now().strftime("%Y%m%d-%H%M%S")
fileName = "submission_" + now + ".csv"
createCSVSubmissionFile(predictions, fileName)