In [1]:
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
train_dataset = np.genfromtxt('resources/train.csv', dtype=np.float32, delimiter=',')
train_targets = train_dataset[:,-1]
train_targets = np.array([1 if x == 2 else 0 for x in train_targets])
test_dataset = np.genfromtxt('resources/test.csv', dtype=np.float32, delimiter=',')
test_targets = test_dataset[:,-1]
test_targets = np.array([1 if x == 2 else 0 for x in test_targets])
combined_dataset=np.concatenate((train_dataset[:,:-1], test_dataset[:,:-1]))

In [3]:
np.bincount(train_targets)

array([58630, 67343])

In [4]:
numeric_indices = []
for i in range(combined_dataset.shape[1]):
    n_unique = len(np.unique(combined_dataset[:, i]))
    numeric_indices.append(n_unique > 2)
print (len(numeric_indices))
numeric_indices = np.array(numeric_indices, dtype=np.bool)

122


In [5]:
combined_dataset[:, numeric_indices] = StandardScaler().fit_transform(
                                        combined_dataset[:, numeric_indices]
                                        )

In [6]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == labels)
          / predictions.shape[0])

In [7]:
batch_size = 1500
num_feature = 122
num_labels = 2
graph = tf.Graph()
with graph.as_default():
    target = tf.placeholder(tf.float32, shape=(None, 2))
    n_input = num_feature
    n_hidden_1 = 200
    n_hidden_2 = 50
    n_hidden_3 = 2
    
    inputs = {'l1': tf.placeholder(tf.float32, shape=(None, num_feature)),
              'l2': tf.placeholder(tf.float32, shape=(None, n_hidden_1)),
              'l3': tf.placeholder(tf.float32, shape=(None, n_hidden_2))}
    
    targets = {'l1': tf.placeholder(tf.float32, shape=(None, num_feature)),
               'l2': tf.placeholder(tf.float32, shape=(None, n_hidden_1)),
               'l3': tf.placeholder(tf.float32, shape=(None, num_labels))}

    weights = {'l1': tf.Variable(tf.truncated_normal([n_input, n_hidden_1], dtype=tf.float32)),
               'l2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], dtype=tf.float32)),
               'l3': tf.Variable(tf.truncated_normal([n_hidden_2, num_labels], dtype=tf.float32))}

    enc_biases = {'l1': tf.Variable(tf.truncated_normal([n_hidden_1], dtype=tf.float32)),
                  'l2': tf.Variable(tf.truncated_normal([n_hidden_2], dtype=tf.float32)),
                  'l3': tf.Variable(tf.truncated_normal([num_labels], dtype=tf.float32))}
    
    dec_biases = {'l1': tf.Variable(tf.truncated_normal([n_input], dtype=tf.float32)),
                  'l2': tf.Variable(tf.truncated_normal([n_hidden_1], dtype=tf.float32))}
    
    def encode(layer_id):
        return tf.nn.relu(tf.add(tf.matmul(inputs[layer_id], weights[layer_id]), enc_biases[layer_id]))
    
    def decode(layer, layer_id):
        return tf.add(tf.matmul(layer, tf.transpose(weights[layer_id])), dec_biases[layer_id])
    
    def get_loss(inp, out):
        difference = tf.sub(inp, out) 
        return tf.sqrt(tf.reduce_mean(tf.square(difference)))   

    def encoder_loss(layer_id):
        enc = encode(layer_id)
        dec = decode(enc, layer_id)
        loss = get_loss(targets[layer_id], dec)
        return loss
    
    def logit_loss(logits, layer_id):
        return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits, targets[layer_id]))
    
    def full_compute():
        logits = inputs['l1']
        for layer_id in ['l1', 'l2', 'l3']:
            logits = tf.nn.relu(tf.add(tf.matmul(logits, weights[layer_id]), enc_biases[layer_id]))
        return logits
        
    def get_sigmoid(logits):
        return tf.nn.sigmoid(logits)
    
    loss_l1 = encoder_loss('l1')
    optimizer_l1 = tf.train.AdamOptimizer(0.001).minimize(loss_l1)
    loss_l2 = encoder_loss('l2')
    optimizer_l2 = tf.train.AdamOptimizer(0.001).minimize(loss_l2)
    logits = full_compute()
    loss_l3 = logit_loss(logits, 'l3')
    optimizer_l3 = tf.train.AdamOptimizer(0.001).minimize(loss_l3 + 0.001*tf.nn.l2_loss(weights['l3']))

In [10]:
randomized_dataset = combined_dataset.copy()
# np.random.shuffle(randomized_dataset)
preds = []
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print ("Training First Layer")
    num_steps = 200
    for step in range(1, num_steps + 1):
        offset = (step * batch_size) % (randomized_dataset.shape[0] - batch_size)
        batch_data = randomized_dataset[offset:(offset + batch_size), :]
        noisy_batch_data = batch_data.copy()
        for i, point in enumerate(noisy_batch_data):
            noisy_batch_data[i, :] = point + np.random.normal(0, 1, noisy_batch_data.shape[1])
        feed_dict = {inputs['l1']: noisy_batch_data, targets['l1']: batch_data}
        _, l = session.run([optimizer_l1, loss_l1], feed_dict=feed_dict)
        if step % 50 == 0:
            print step, ':', l
    encoder_op = encode('l1')
    encoded_randomized_dataset = encoder_op.eval(feed_dict={inputs['l1']: randomized_dataset})
    num_steps = 200
    print ("Training Second Layer")
    for step in range(1, num_steps + 1):
        offset = (step * batch_size) % (randomized_dataset.shape[0] - batch_size)
        batch_data = encoded_randomized_dataset[offset:(offset + batch_size), :]
        noisy_batch_data = batch_data.copy()
        for i, point in enumerate(noisy_batch_data):
            noisy_batch_data[i, :] = point + np.random.normal(0, 1, noisy_batch_data.shape[1])
        feed_dict = {inputs['l2']: noisy_batch_data, targets['l2']: batch_data}
        _, l = session.run([optimizer_l2, loss_l2], feed_dict=feed_dict)
        if step % 50 == 0:
            print step, ':', l
    encoded_dataset = combined_dataset[:train_dataset.shape[0], :]
    num_steps = 400
    print ("Training Thrid Layer")
    for step in range(1, num_steps + 1):
        offset = (step * batch_size) % (encoded_dataset.shape[0] - batch_size)
        batch_data = encoded_dataset[offset:(offset + batch_size), :]
        batch_labels = list(train_targets[offset:(offset + batch_size)])
        for i, label in enumerate(batch_labels):
            batch_labels[i] = (label == np.arange(2)).astype(np.int32)
        batch_labels = np.array(batch_labels, dtype=np.int32)
        feed_dict = {inputs['l1']: batch_data, targets['l3']: batch_labels}
        _, l = session.run([optimizer_l3, loss_l3], feed_dict=feed_dict)
        if step % 100 == 0:
            print step, ':', l
    logit_op = full_compute()
    softmaxes = get_sigmoid(logit_op)
    preds = softmaxes.eval(feed_dict={inputs['l1']: combined_dataset[train_dataset.shape[0]:, :]})

Training First Layer
50 : 113.205
100 : 101.877
150 : 92.0403
200 : 85.2228
Training Second Layer
50 : 110.068
100 : 83.3341
150 : 57.1189
200 : 43.5317
Training Thrid Layer
100 : 0.463987
200 : 0.415287
300 : 0.3935
400 : 0.393251


In [11]:
accuracy(preds, test_targets)

87.28708303761533

In [12]:
preds = np.array([np.argmax(x) for x in preds])
print confusion_matrix(test_targets, preds)

[[10927  1906]
 [  960  8751]]


In [66]:
import csv
with open('predictions2.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(preds.reshape(-1, 1))