In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import seaborn as sns
import matplotlib.pyplot as plt


#simple dataset
class SimpleDataSet(object):
    """Minimal sequential mini-batch iterator over parallel data/label sequences.

    Args:
        data: Sliceable sequence of samples (list, numpy array, ...).
        label: Sliceable sequence of labels, same length as ``data``.
        batch_size: Positive number of samples per batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        AssertionError: If ``data`` and ``label`` differ in length.
    """

    def __init__(self, data, label, batch_size):
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")
        self._data = data
        self._label = label
        self._start = 0
        self._batch_size = batch_size
        assert (len(self._data) == len(self._label))
        self._size = len(self._data)

    def new_epoch(self):
        """Rewind the cursor so the next batch starts at the first sample."""
        self._start = 0

    def next_batch(self):
        """Return the next ``(data, label)`` slice and advance the cursor.

        The final batch of an epoch may be shorter than ``batch_size``. Once
        the data is exhausted, empty slices are returned until ``new_epoch``
        is called.
        """
        start = self._start
        # Clamp to the dataset size (not size - 1) so the last sample is kept.
        end = min(start + self._batch_size, self._size)
        self._start = end
        return self._data[start:end], self._label[start:end]

    def total_batch(self):
        """Return the number of batches per epoch as an int (ceiling division)."""
        # Floor division keeps the result an int under both Python 2 and 3,
        # so it can be passed straight to range().
        return (self._size + self._batch_size - 1) // self._batch_size

In [23]:
# Load the training and tournament CSVs, hold out 20% of the training rows
# for evaluation, and expose every split as a plain numpy array.
# NOTE(review): the backslash-escaped quotes (\') in this cell look like an
# export artifact rather than intended Python — confirm against the notebook.
# Load the data from the CSV files
training_data = pd.read_csv(
    \'../../nb/numerai1/numerai_training_data_.csv\', header=0)
prediction_data = pd.read_csv(
    \'../../nb/numerai1/numerai_tournament_data.csv\', header=0)

# Keep an untouched copy, draw a reproducible 80% sample (random_state=1) as
# the training split, and use the rows whose index was not sampled as the
# held-out test split.
total_data = training_data.copy()
training_data = total_data.sample(frac=0.8, random_state=1)
test_data = total_data.loc[~total_data.index.isin(training_data.index)]

# Transform the loaded CSV data into numpy arrays:
# features are every column except 'target'; tournament rows carry a 't_id'
# column instead, which is split off and kept alongside the features.
feas_data = training_data.drop(\'target\', axis=1)
label_data = training_data[\'target\']
test_feas_data = test_data.drop(\'target\', axis=1)
test_label_data = test_data[\'target\']
tid_data = prediction_data[\'t_id\']
tour_data = prediction_data.drop(\'t_id\', axis=1)
_train_feas = feas_data.values
_train_label = label_data.values
_test_feas = test_feas_data.values
_test_label = test_label_data.values
_tour_feas = tour_data.values
_tid = tid_data.values

# Switches for the cells below: 1 enables the step, anything else disables it.
onehot_option = 1  # one-hot encode labels (depth 2) instead of an (N, 1) column
normal_option = 0  # standardise features before training

_train_onehot_label = []
_test_onehot_label = []
# tf.one_hot requires integer indices, so cast defensively in case the CSV
# column was parsed as float.
onehot_op = tf.one_hot(
    indices=np.asarray(_train_label, dtype=np.int64),
    on_value=1., off_value=0., depth=2)
test_onehot_op = tf.one_hot(
    indices=np.asarray(_test_label, dtype=np.int64),
    on_value=1., off_value=0., depth=2)
# Compare by value: `is 1` tests object identity and only works by accident
# of small-int caching.
if onehot_option == 1:
    with tf.Session() as sess:
        _train_onehot_label = sess.run(onehot_op)
        _test_onehot_label = sess.run(test_onehot_op)
else:
    # No TensorFlow work is needed for the single-column layout.
    _train_onehot_label = np.reshape(_train_label, (-1, 1))
    _test_onehot_label = np.reshape(_test_label, (-1, 1))


def feature_normalize(features, mu=None, sigma=None):
    """Standardise each column of ``features`` to zero mean and unit variance.

    Args:
        features: 2-D array-like of shape (n_samples, n_features).
        mu: Optional per-column means. When omitted they are computed from
            ``features``. Pass the training-set means to scale another split
            consistently.
        sigma: Optional per-column standard deviations, handled like ``mu``.

    Returns:
        ``(features - mu) / sigma`` with the same shape as ``features``.
        Columns whose standard deviation is zero are only centred, never
        divided by zero.
    """
    if mu is None:
        mu = np.mean(features, axis=0)
    if sigma is None:
        sigma = np.std(features, axis=0)
    sigma = np.asarray(sigma)
    # A constant column has sigma == 0; dividing would produce NaN/inf.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (features - mu) / safe_sigma


# Standardise all three splits with statistics fitted on the training split
# only. Scaling the tournament data with its own mean/std, or leaving the
# test split unscaled, would feed the model inputs on a different scale from
# the one it was trained on. The statistics are computed inline so that one
# mean/std pair can be shared by every split.
if normal_option == 1:
    _feas_mu = np.mean(_train_feas, axis=0)
    _feas_sigma = np.std(_train_feas, axis=0)
    # Guard constant columns against division by zero.
    _feas_sigma = np.where(_feas_sigma == 0, 1.0, _feas_sigma)
    _train_feas = (_train_feas - _feas_mu) / _feas_sigma
    _test_feas = (_test_feas - _feas_mu) / _feas_sigma
    _tour_feas = (_tour_feas - _feas_mu) / _feas_sigma

In [21]:
# Show the (rows, columns) shape of the held-out test split.
test_data.shape

(27315, 51)

In [26]:
# Pairwise correlation matrix over the columns of the training split.
training_data.corr()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50,target
feature1,1.0,-0.547752,0.188399,0.012195,0.27194,-0.115925,0.464401,0.038626,-0.069747,0.401728,...,-0.301285,-0.403912,0.4579,0.190477,-0.1041,0.060931,0.399299,-0.188582,0.202357,0.010096
feature2,-0.547752,1.0,-0.385038,-0.027149,-0.250862,0.293933,-0.139284,0.491165,-0.031156,-0.22935,...,0.470873,-0.02808,-0.268174,-3.2e-05,-0.34495,0.125439,-0.223892,-0.243075,-0.268161,-0.012585
feature3,0.188399,-0.385038,1.0,0.141428,0.547272,-0.574971,0.232475,-0.057986,-0.607304,0.312914,...,-0.373919,0.132449,0.005468,0.292331,0.149504,0.348877,0.503185,-0.056034,0.129323,0.03538
feature4,0.012195,-0.027149,0.141428,1.0,0.472861,-0.132787,-0.081232,-0.596228,-0.090177,-0.388336,...,-0.04697,0.718092,0.126124,0.500078,0.282742,0.04697,-0.042374,0.660119,0.635071,0.009333
feature5,0.27194,-0.250862,0.547272,0.472861,1.0,0.000291,0.433613,-0.296846,-0.386862,0.454504,...,-0.120184,0.364581,-0.15821,0.795651,0.363971,0.672749,0.371995,0.220235,0.205222,0.036046
feature6,-0.115925,0.293933,-0.574971,-0.132787,0.000291,1.0,-0.09693,0.177762,-0.008506,0.04929,...,0.311639,-0.015387,-0.424147,0.059797,-0.210808,0.274445,-0.268461,-0.141161,-0.157253,-0.017276
feature7,0.464401,-0.139284,0.232475,-0.081232,0.433613,-0.09693,1.0,0.106962,-0.086672,0.664223,...,-0.207286,-0.323663,0.105679,0.449574,0.105143,0.440815,0.275172,-0.205968,-0.160077,0.032228
feature8,0.038626,0.491165,-0.057986,-0.596228,-0.296846,0.177762,0.106962,1.0,-0.18418,0.212485,...,0.245033,-0.611339,-0.084469,-0.150242,-0.609417,0.095215,0.021948,-0.724489,-0.390877,-0.006604
feature9,-0.069747,-0.031156,-0.607304,-0.090177,-0.386862,-0.008506,-0.086672,-0.18418,1.0,-0.345872,...,0.383006,-0.123735,0.214562,-0.157244,0.382881,-0.650355,-0.381149,0.336423,0.029528,-0.015055
feature10,0.401728,-0.22935,0.312914,-0.388336,0.454504,0.04929,0.664223,0.212485,-0.345872,1.0,...,-0.357903,-0.327708,-0.076684,0.248379,-0.00352,0.703493,0.571077,-0.375442,-0.494857,0.023206


In [None]:
# Scatter-matrix of the first six features, restricted to the first 200
# training rows.
_pairplot_columns = [
    "feature1", "feature2", "feature3",
    "feature4", "feature5", "feature6",
]
_pairplot_rows = training_data.head(200)
sns.pairplot(_pairplot_rows[_pairplot_columns])
plt.show()

In [38]:
# Mini-batch logistic regression on the 50 input features.
learning_rate = 0.1
training_epochs = 200
batch_size = 200
display_step = 1

# NOTE(review): this decayed learning rate (and its `batch` step counter) is
# built but never handed to the optimizer below, which uses the constant
# `learning_rate`. Kept as-is; wire it in with
# `minimize(cost_op, global_step=batch)` if decay is actually wanted.
batch = tf.Variable(0, trainable=False)
learning_rate_op = tf.train.exponential_decay(
    0.02 * 0.01,  # Base learning rate.
    batch * batch_size,  # Current index into the dataset.
    _train_feas.shape[0],  # Decay step.
    0.96,  # Decay rate.
    staircase=True)

# tf Graph Input
x = tf.placeholder(tf.float32, [None, 50])

# Build exactly one model variant; the label layout decides which one.
# (`==` rather than `is`: the option is compared by value.)
if onehot_option == 1:
    # Two-column one-hot labels -> softmax over two classes.
    y = tf.placeholder(tf.float32, [None, 2])
    W = tf.Variable(tf.zeros([50, 2]))
    b = tf.Variable(tf.zeros([2]))
    pred_op = tf.nn.softmax(tf.matmul(x, W) + b)
    # Clip before the log so a saturated probability cannot yield -inf/NaN.
    _safe_pred = tf.clip_by_value(pred_op, 1e-10, 1.0)
    _data_loss = tf.reduce_sum(-y * tf.log(_safe_pred), reduction_indices=[1])
    correct_prediction_op = tf.equal(tf.argmax(pred_op, 1), tf.argmax(y, 1))
else:
    # Single-column 0/1 labels -> sigmoid with full binary cross-entropy.
    y = tf.placeholder(tf.float32, [None, 1])
    W = tf.Variable(tf.zeros([50, 1]))
    b = tf.Variable(tf.zeros([1]))
    pred_op = tf.nn.sigmoid(tf.matmul(x, W) + b)
    _safe_pred = tf.clip_by_value(pred_op, 1e-7, 1.0 - 1e-7)
    # Both terms are required; with only -y*log(p) the loss ignores every
    # negative example.
    _data_loss = tf.reduce_sum(
        -y * tf.log(_safe_pred) - (1 - y) * tf.log(1 - _safe_pred),
        reduction_indices=[1])
    # argmax over a single column is always 0, so threshold at 0.5 instead.
    correct_prediction_op = tf.equal(
        tf.cast(tf.greater(pred_op, 0.5), tf.float32), y)

# Per-example loss plus an (unscaled) L2 penalty on W and b, averaged over
# the batch.
# NOTE(review): the penalty weight is 1.0, and the logged cost sits at
# ~0.6943 (close to ln 2) from the second epoch on — the regulariser may be
# pinning the weights near zero. Consider scaling it down.
cost_op = tf.reduce_mean(_data_loss + tf.nn.l2_loss(W) + tf.nn.l2_loss(b))
optimizer_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    cost_op)

# Evaluation op is created once, not once per epoch, so the graph does not
# grow while training.
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction_op, tf.float32))
init_op = tf.global_variables_initializer()

# Launch the graph
result = []
with tf.Session() as sess:
    sds = SimpleDataSet(_train_feas, _train_onehot_label, batch_size)
    # range() needs an int regardless of how total_batch() divides.
    total_batch = int(sds.total_batch())
    sess.run(init_op)
    # Training cycle
    for epoch in range(training_epochs):
        sds.new_epoch()
        avg_cost = 0.
        # Loop over all batches
        for i in range(total_batch):
            X, Y = sds.next_batch()
            _, c = sess.run([optimizer_op, cost_op], feed_dict={x: X, y: Y})
            avg_cost += c / total_batch
        if (epoch + 1) % display_step == 0:
            test_accuracy = sess.run(
                accuracy_op,
                feed_dict={x: _test_feas, y: _test_onehot_label})
            # Single pre-formatted string: prints identically whether
            # `print` is a statement or a function.
            print("Epoch: %04d cost= %.9f accuracy: %s" %
                  (epoch + 1, avg_cost, test_accuracy))
    # Score the tournament rows with the trained model.
    result = pred_op.eval({x: _tour_feas})
    print("Finished!")
result

Epoch: 0001 cost= 0.694324794 accuracy: 0.498078
Epoch: 0002 cost= 0.694322739 accuracy: 0.498078
Epoch: 0003 cost= 0.694322739 accuracy: 0.498078
Epoch: 0004 cost= 0.694322739 accuracy: 0.498078


KeyboardInterrupt: 

In [25]:
# Write the tournament predictions as (t_id, probability).
# The last column is the probability of class 1 in both layouts: column 1 of
# the two-column softmax output, or the only column of the sigmoid output
# (where `result[:, 1]` would raise IndexError).
results_df = pd.DataFrame(data={'t_id': _tid, 'probability': result[:, -1]})
results_df.to_csv("../../nb/numerai1/predictions.csv", index=False,
                  columns=['t_id', 'probability'])


In [None]:
# Serialise the first `r` training rows into a TFRecord file.
train_tfrecords_fn = "../../nb/numerai1/numerai/numerai_training_data.csv.tfrecords"
r = 20  # number of rows to write
writer = tf.python_io.TFRecordWriter(train_tfrecords_fn)
try:
    # itertuples() yields (index, col1, col2, ...): positions 1..50 are read
    # as the features and position 51 as the label.
    for i in training_data.head(r).itertuples():
        feas = list(i[1:51])
        label = i[51]
        print("%s %s" % (feas, label))
        example = tf.train.Example(features=tf.train.Features(feature={
            'label': tf.train.Feature(float_list=tf.train.FloatList(value=[label])),
            'feas': tf.train.Feature(float_list=tf.train.FloatList(value=feas))
        }))
        writer.write(example.SerializeToString())
finally:
    # Always release the file handle, even if serialisation fails part-way.
    writer.close()

In [None]:
# Read-back check: parse and print the first two records of the TFRecord file.
max_records = 2
record_stream = tf.python_io.tf_record_iterator(train_tfrecords_fn)
for shown, serialized_example in enumerate(record_stream, 1):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    feature_map = example.features.feature
    print(feature_map['feas'].float_list.value)
    print(feature_map['label'].float_list.value)
    if shown >= max_records:
        break

In [None]:
# NOTE(review): this path differs from the one the writer cell used
# ("../../nb/numerai1/numerai/...") — confirm both resolve to the same file.
train_tfrecords_fn = "./numerai/numerai_training_data.csv.tfrecords"


def read_and_decode(filename, num_features=50):
    """Build graph ops that read one (features, label) example from a TFRecord file.

    Args:
        filename: Path of the TFRecord file to read.
        num_features: Length of the 'feas' float list stored in each record.

    Returns:
        A ``(feas, label)`` tuple of tensors: ``feas`` is a float32 vector of
        length ``num_features`` and ``label`` is the stored float label cast
        to int32.
    """
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.float32),
            # Each record stores a list of floats, not a scalar, so the
            # declared shape must match its length or parsing fails.
            'feas': tf.FixedLenFeature([num_features], tf.float32),
        })

    feas = features['feas']
    label = tf.cast(features['label'], tf.int32)
    return feas, label


feas, label = read_and_decode(train_tfrecords_fn)

In [None]:
# Hyper-parameters for the queue-fed training run.
learning_rate, training_epochs = 0.01, 25
batch_size, display_step = 200, 1

# Group single examples from the reader into fixed-size batches
# (plain FIFO batching; no shuffling, so no min_after_dequeue).
feas_batch, label_batch = tf.train.batch(
    [feas, label], batch_size=batch_size, capacity=2000)

# Number of full batches that fit in the training frame.
total_batch = len(training_data.index) // batch_size

In [None]:
# Pull a single batch to check the input pipeline end to end.
with tf.Session() as sess:
    # The filename and batch queues are filled by background threads; without
    # starting them, the run() below would block forever.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        f, l = sess.run([feas_batch, label_batch])
        print("%s %s" % (f, l))
    finally:
        # Stop and join the feeder threads before the session closes.
        coord.request_stop()
        coord.join(threads)

In [None]:
# tf Graph Input
x = tf.placeholder(tf.float32, [None, 50])  # 50 features per example
y = tf.placeholder(tf.float32, [None, 1])   # 0/1 target

W = tf.Variable(tf.zeros([50, 1]))
b = tf.Variable(tf.zeros([1]))

pred = tf.nn.sigmoid(tf.matmul(x, W) + b)  # sigmoid
# Clip so the logs below stay finite when the sigmoid saturates.
_pred_clipped = tf.clip_by_value(pred, 1e-7, 1.0 - 1e-7)
# Full binary cross-entropy. With only the y*log(pred) term, the loss is
# minimised by predicting 1 for every example.
cost = tf.reduce_mean(-tf.reduce_sum(
    y * tf.log(_pred_clipped) + (1 - y) * tf.log(1 - _pred_clipped),
    reduction_indices=1))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

In [None]:

init = tf.global_variables_initializer()

# Evaluation ops. argmax over a single-column prediction is always 0, so
# threshold the sigmoid output at 0.5 and compare with the 0/1 label instead.
correct_prediction = tf.equal(tf.cast(tf.greater(pred, 0.5), tf.float32), y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # The input queues are filled by background threads; start them or the
    # first dequeue blocks forever.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        # Training cycle
        for epoch in range(training_epochs):
            print("epoch: %s" % epoch)
            avg_cost = 0.
            # Loop over all batches
            for i in range(total_batch):
                print("loop: %s" % i)
                # Fetch features and labels in ONE run() call: two separate
                # eval() calls each dequeue a different batch, which would
                # pair features with the wrong labels.
                batch_feas, batch_labels = sess.run([feas_batch, label_batch])
                _, c = sess.run(
                    [optimizer, cost],
                    feed_dict={
                        x: batch_feas,
                        # Labels arrive as shape (batch,); y expects (batch, 1).
                        y: np.reshape(batch_labels, (-1, 1)),
                    })
                # Compute average loss
                avg_cost += c / total_batch
                print("avg_cost: %s" % avg_cost)
            # Display logs per epoch step
            if (epoch + 1) % display_step == 0:
                print("Epoch: %04d cost= %.9f" % (epoch + 1, avg_cost))

        print("Optimization Finished!")
    finally:
        coord.request_stop()
        coord.join(threads)


In [None]:
# Baseline: scikit-learn logistic regression trained on the full training file.
# Seed NumPy's global RNG so repeated runs are comparable.
np.random.seed(0)

print("Loading data...")
training_data = pd.read_csv('./numerai/numerai_training_data.csv', header=0)
prediction_data = pd.read_csv('./numerai/numerai_tournament_data.csv', header=0)

# Separate the target from the features, and the row ids from the
# tournament features.
X = training_data.drop(['target'], axis=1)
Y = training_data['target']
x_prediction = prediction_data.drop(['t_id'], axis=1)
t_id = prediction_data['t_id']

# Logistic regression using all available cores.
model = linear_model.LogisticRegression(n_jobs=-1)

print("Training...")
model.fit(X, Y)

print("Predicting...")
# predict_proba returns two columns, [P(target=0), P(target=1)]; only the
# second one is submitted.
y_prediction = model.predict_proba(x_prediction)
results = y_prediction[:, 1]
results_df = pd.DataFrame(data={'probability': results})
joined = t_id.to_frame().join(results_df)

print("Writing predictions to predictions.csv")
# The resulting file is what gets uploaded to numer.ai.
joined.to_csv("predictions.csv", index=False)
# Now you can upload these predictions on numer.ai