In [2]:
import numpy as np 
import pandas as pd 
import scipy as sp 
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
import random
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelBinarizer, StandardScaler, normalize

%matplotlib inline

# unzip new data

In [3]:
import zipfile
zip_ref = zipfile.ZipFile('data/numerai_datasets.zip', 'r')
zip_ref.extractall('data/')
zip_ref.close()

# Split data

In [4]:
data = pd.read_csv('data/numerai_training_data.csv')
val_data = pd.read_csv('data/numerai_tournament_data.csv')

In [5]:
X_val = val_data.loc[val_data.data_type == 'validation', :].iloc[:, 3:-1]
y_val = np.array(val_data.loc[val_data.data_type == 'validation', :].iloc[:, -1].values, dtype=int)

In [6]:
X_prediction = val_data.iloc[:, 3:-1]

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 3:-1], data.iloc[:, -1])
X_train = data.iloc[:, 3:-1]
y_train = np.array(data.iloc[:, -1], dtype=int)

# Over-sampling

In [8]:
X = data.drop(['era', 'data_type'], axis=1)
Y = data.era.values
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, Y)



In [9]:
res_df = pd.DataFrame(X_res, columns=X.columns)
sd = StandardScaler()
res_feature = sd.fit_transform(res_df.iloc[:, 1:-1])
res_label = np.array(res_df.iloc[:, -1], dtype=int)

# training model utilities

In [10]:
def test_clf(grid_search, features, labels, parameters, iterations=100):
    from sklearn.metrics import classification_report
    precision, recall = [], []
    for iteration in range(iterations):
        features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=iteration)
        grid_search.fit(features_train, labels_train)
        predictions = grid_search.predict_proba(features_test)
    print(log_loss(labels_test, predictions[:, 1]))
    best_params = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('%s=%r, ' % (param_name, best_params[param_name]))

In [11]:
# evalute classifier accuracy 

def clf_eval(clf, X_test, y_test, X_val, y_val):
    y_pred = clf.predict_proba(X_test)[:, 1]
#     precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred)
    loss = log_loss(y_test, y_pred)
    print('Test set')
    print('logloss: {}'.format(loss))
#     print('precision \t recall \t fscore \t support \n' +
#           '{} \t {} \t {} \t  {} \n'.format(precision[0], recall[0], fscore[0], support[0]) + 
#           '{} \t {} \t {} \t  {}'.format(precision[1], recall[1], fscore[1], support[1]))
    
    y_pred = clf.predict_proba(X_val)[:, 1]
    loss = log_loss(y_val, y_pred)

#     precision, recall, fscore, support = precision_recall_fscore_support(y_val, y_pred)
    print('\n Validation set')
    print('logloss: {}'.format(loss))
#     print('precision \t recall \t fscore \t support \n' +
#           '{} \t {} \t {} \t  {} \n'.format(precision[0], recall[0], fscore[0], support[0]) + 
#           '{} \t {} \t {} \t  {}'.format(precision[1], recall[1], fscore[1], support[1]))   

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

clf1 = DecisionTreeClassifier(random_state=0)
clf1.fit(res_feature, np.array(res_label, dtype=int))

clf_eval(clf1, X_train, y_train, X_val, y_val)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf2 = RandomForestClassifier()
parameters = {'n_estimators': [10, 20, 30, 40],
              'max_depth': [5, 10],
              'min_samples_split': [2, 5, 10], 
              'min_impurity_split' : [1e-7,1e-15]}

grid_search2 = GridSearchCV(clf2, parameters)

test_clf(grid_search2, res_feature, res_label, parameters, iterations=1)

# Gradient Booting Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()

parameters = {'n_estimators': [10, 20, 30, 40],
              'max_depth': [5, 10, 20],
              'min_samples_split': [2, 5, 10], 
              'min_impurity_split' : [1e-7,1e-15]}
# 
grid_search3= GridSearchCV(clf, parameters)

test_clf(grid_search3, res_feature, res_label, parameters, iterations=1)

# clf_eval(clf, X_test, y_test, X_val, y_val)

#  Voting Classifier

In [None]:
clf1 = LogisticRegression(random_state=2)
clf2 = grid_search2.estimator
clf3 = GaussianNB()
clf4 = grid_search3.estimator

# eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('gbx', clf4)], voting='soft')
eclf = VotingClassifier(estimators=[('rf', clf2), ('lr', clf1), ('gnb', clf3)], voting='soft')


clf1.fit(res_feature, res_label)
clf2.fit(res_feature, res_label)
clf3.fit(res_feature, res_label)
# clf4.fit(res_feature, res_label)
eclf.fit(res_feature, res_label)
print(log_loss(y_val, eclf.predict_proba(X_val)[:,1]))

# Neural nets

In [33]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
a = enc.fit_transform(res_label)

(array([1, 1, 1, ..., 0, 0, 0]), array([1, 1, 1, ..., 0, 0, 0]))

In [36]:
from keras.models import Sequential
from keras.layers import Dense, Activation


model = Sequential()

model.add(Dense(units=64, input_dim=21))
model.add(Activation('relu'))
model.add(Dense(units=1))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(res_feature, res_label, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5183d3b320>

In [57]:
pred = model.predict_proba(sd.fit_transform(X_val),)
pred



array([[1],
       [1],
       [1],
       ..., 
       [1],
       [1],
       [1]], dtype=int32)

array([ 0.], dtype=float32)

# make submission 

In [None]:
probability = grid_search2.predict_proba(sd.fit_transform(X_prediction))[:, 1]
submit = pd.DataFrame({'id':val_data.id.values, 'probability': probability})
submit.to_csv('submit.csv', index=False)