In [1]:
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
train_dataset = np.genfromtxt('resources/train.csv', dtype=np.float32, delimiter=',')
train_targets = train_dataset[:,-1]
train_targets = np.array([1 if x == 2 else 0 for x in train_targets])
test_dataset = np.genfromtxt('resources/test.csv', dtype=np.float32, delimiter=',')
test_targets = test_dataset[:,-1]
test_targets = np.array([1 if x == 2 else 0 for x in test_targets])
combined_dataset=np.concatenate((train_dataset[:,:-1], test_dataset[:,:-1]))

In [3]:
np.bincount(train_targets)

array([58630, 67343])

In [4]:
numeric_indices = []
for i in range(combined_dataset.shape[1]):
    n_unique = len(np.unique(combined_dataset[:, i]))
    numeric_indices.append(n_unique > 2)
print (len(numeric_indices))
numeric_indices = np.array(numeric_indices, dtype=np.bool)

122


In [5]:
combined_dataset[:, numeric_indices] = StandardScaler().fit_transform(
                                        combined_dataset[:, numeric_indices]
                                        )

In [69]:
batch_size = 1500
num_feature = 122
num_labels = 2
graph = tf.Graph()
with graph.as_default():
    target = tf.placeholder(tf.float32, shape=(None, 2))
    n_input = num_feature
    n_hidden_1 = 200
    n_hidden_2 = 100
    n_hidden_3 = 50
    
    inputs = {'l1': tf.placeholder(tf.float32, shape=(None, num_feature)),
              'l2': tf.placeholder(tf.float32, shape=(None, n_hidden_1)),
              'l3': tf.placeholder(tf.float32, shape=(None, n_hidden_2))}
    
    targets = {'l1': tf.placeholder(tf.float32, shape=(None, num_feature)),
               'l2': tf.placeholder(tf.float32, shape=(None, n_hidden_1)),
               'l3': tf.placeholder(tf.float32, shape=(None, num_labels))}

    weights = {'l1': tf.Variable(tf.truncated_normal([n_input, n_hidden_1], dtype=tf.float32)),
               'l2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], dtype=tf.float32)),
               'l3': tf.Variable(tf.truncated_normal([n_hidden_2, n_hidden_3], dtype=tf.float32))}

    enc_biases = {'l1': tf.Variable(tf.truncated_normal([n_hidden_1], dtype=tf.float32)),
                  'l2': tf.Variable(tf.truncated_normal([n_hidden_2], dtype=tf.float32)),
                  'l3': tf.Variable(tf.truncated_normal([n_hidden_3], dtype=tf.float32))}
    
    dec_biases = {'l1': tf.Variable(tf.truncated_normal([n_input], dtype=tf.float32)),
                  'l2': tf.Variable(tf.truncated_normal([n_hidden_1], dtype=tf.float32))}
    
    wo = tf.Variable(tf.truncated_normal([n_hidden_3, num_labels], dtype=tf.float32))
    bo = tf.Variable(tf.truncated_normal([num_labels], dtype=tf.float32))
    
    def encode(layer_id):
        return tf.add(tf.matmul(inputs[layer_id], weights[layer_id]), enc_biases[layer_id])
    
    def decode(layer, layer_id):
        return tf.add(tf.matmul(layer, tf.transpose(weights[layer_id])), dec_biases[layer_id])
    
    def get_loss(inp, out):
        difference = tf.sub(inp, out) 
        return tf.sqrt(tf.reduce_mean(tf.square(difference)))   

    def encoder_loss(layer_id):
        enc = encode(layer_id)
        dec = decode(enc, layer_id)
        loss = get_loss(targets[layer_id], dec)
        return loss
    
    def compute_logit(layer_id):
        logits = tf.nn.tanh(encode(layer_id))
        logits = tf.add(tf.matmul(logits, wo), bo)
        return logits
    
    def logit_loss(logits, layer_id):
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, targets[layer_id]))
    
    def get_softmax(logits):
        return tf.nn.softmax(logits)
    
    loss_l1 = encoder_loss('l1')
    optimizer_l1 = tf.train.AdamOptimizer(0.001).minimize(loss_l1)
    loss_l2 = encoder_loss('l2')
    optimizer_l2 = tf.train.AdamOptimizer(0.001).minimize(loss_l2)
    logits = compute_logit('l3')
    loss_l3 = logit_loss(logits, 'l3')
    optimizer_l3 = tf.train.AdamOptimizer(0.001).minimize(loss_l3)

In [70]:
num_steps = 10000
randomized_dataset = combined_dataset.copy()
np.random.shuffle(randomized_dataset)
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print ("Initialized")
    for step in range(1, num_steps + 1):
        offset = (step * batch_size) % (randomized_dataset.shape[0] - batch_size)
        batch_data = randomized_dataset[offset:(offset + batch_size), :]
        noisy_batch_data = batch_data.copy()
        for i, point in enumerate(noisy_batch_data):
            noisy_batch_data[i, :] = point + np.random.normal(0, 1, noisy_batch_data.shape[1])
        feed_dict = {inputs['l1']: noisy_batch_data, targets['l1']: batch_data}
        _, l = session.run([optimizer_l1, loss_l1], feed_dict=feed_dict)
        if step % 500 == 0:
            print step, ':', l

Initialized
500 : 85.9166
1000 : 36.9498
1500 : 15.7648
2000 : 6.88497
2500 : 3.11645
3000 : 1.48298
3500 : 0.627058
4000 : 0.381959
4500 : 0.323412
5000 : 0.300571
5500 : 0.320765
6000 : 0.309214
6500 : 0.301205
7000 : 0.315248
7500 : 0.301004
8000 : 0.33757
8500 : 0.336385
9000 : 0.299019
9500 : 0.296193
10000 : 0.316098


In [71]:
num_steps = 45000
preds = []
randomized_dataset = combined_dataset.copy()
np.random.shuffle(randomized_dataset)
new_input=[]
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print ("Initialized")
    encoder_op = encode('l1')
    encoded_randomized_dataset = encoder_op.eval(feed_dict={inputs['l1']: randomized_dataset})
    for step in range(1, num_steps + 1):
        offset = (step * batch_size) % (randomized_dataset.shape[0] - batch_size)
        batch_data = encoded_randomized_dataset[offset:(offset + batch_size), :]
        noisy_batch_data = batch_data.copy()
        for i, point in enumerate(noisy_batch_data):
            noisy_batch_data[i, :] = point + np.random.normal(0, 1, noisy_batch_data.shape[1])
        feed_dict = {inputs['l2']: batch_data, targets['l2']: noisy_batch_data}
        _, l = session.run([optimizer_l2, loss_l2], feed_dict=feed_dict)
        if step % 1000 == 0:
            print step, ':', l

Initialized
1000 : 100.616
2000 : 39.0461
3000 : 23.9815
4000 : 15.5616
5000 : 11.6594
6000 : 9.45537
7000 : 8.33887
8000 : 7.54305
9000 : 6.23905
10000 : 5.16105
11000 : 4.83644
12000 : 4.52867
13000 : 8.02212
14000 : 5.08431
15000 : 4.56395
16000 : 4.66459
17000 : 3.8691
18000 : 4.63158
19000 : 3.68647
20000 : 3.8327
21000 : 2.91987
22000 : 4.47652
23000 : 2.59041
24000 : 2.88201
25000 : 3.51531
26000 : 4.05447
27000 : 2.63748
28000 : 3.0164
29000 : 2.74419
30000 : 2.50407
31000 : 2.4288
32000 : 2.99447
33000 : 2.36293
34000 : 2.39094
35000 : 2.17911
36000 : 3.39051
37000 : 2.32618
38000 : 9.0268
39000 : 2.11285
40000 : 1.96725
41000 : 1.93506
42000 : 1.91593
43000 : 1.78285
44000 : 2.05383
45000 : 1.96982


In [72]:
num_steps = 5000
preds = []
new_input=[]
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print ("Initialized")
    encoder_op = encode('l1')
    encoded_dataset = encoder_op.eval(feed_dict={inputs['l1']: combined_dataset[:train_dataset.shape[0], :]})
    encoder_op = encode('l2')
    encoded_dataset = encoder_op.eval(feed_dict={inputs['l2']: encoded_dataset})
    for step in range(1, num_steps + 1):
        offset = (step * batch_size) % (encoded_dataset.shape[0] - batch_size)
        batch_data = encoded_dataset[offset:(offset + batch_size), :]
        batch_labels = list(train_targets[offset:(offset + batch_size)])
        for i, label in enumerate(batch_labels):
            batch_labels[i] = (label == np.arange(2)).astype(np.int32)
        batch_labels = np.array(batch_labels, dtype=np.int32)
        feed_dict = {inputs['l3']: batch_data, targets['l3']: batch_labels}
        _, l = session.run([optimizer_l3, loss_l3], feed_dict=feed_dict)
        if step % 100 == 0:
            print step, ':', l

Initialized
100 : 1.71176
200 : 0.545867
300 : 0.374125
400 : 0.327542
500 : 0.251414
600 : 0.17622
700 : 0.177035
800 : 0.152508
900 : 0.148748
1000 : 0.117589
1100 : 0.138904
1200 : 0.138447
1300 : 0.122112
1400 : 0.0881525
1500 : 0.111471
1600 : 0.110557
1700 : 0.0881293
1800 : 0.072809
1900 : 0.100673
2000 : 0.0706263
2100 : 0.076882
2200 : 0.0668703
2300 : 0.0767106
2400 : 0.0808679
2500 : 0.0678024
2600 : 0.0923288
2700 : 0.0740183
2800 : 0.0437195
2900 : 0.0540447
3000 : 0.0530275
3100 : 0.0626308
3200 : 0.0439253
3300 : 0.042121
3400 : 0.0552381
3500 : 0.0410883
3600 : 0.0367515
3700 : 0.044155
3800 : 0.0381541
3900 : 0.0401123
4000 : 0.0481869
4100 : 0.0433682
4200 : 0.0492217
4300 : 0.0538658
4400 : 0.0586776
4500 : 0.0287999
4600 : 0.0395907
4700 : 0.0380539
4800 : 0.0362041
4900 : 0.0455984
5000 : 0.0517075


In [73]:
preds = []
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    encoder_op = encode('l1')
    encoded_dataset = encoder_op.eval(feed_dict={inputs['l1']: combined_dataset[train_dataset.shape[0]:, :]})
    encoder_op = encode('l2')
    encoded_dataset = encoder_op.eval(feed_dict={inputs['l2']: encoded_dataset})
    logit_op = compute_logit('l3')
    softmaxes = get_softmax(logit_op)
    preds = softmaxes.eval(feed_dict={inputs['l3']: encoded_dataset})

In [74]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == labels)
          / predictions.shape[0])

In [75]:
accuracy(preds, test_targets)

34.905074520936836

In [79]:
num_steps = 7000
preds = []
randomized_dataset = combined_dataset.copy()
#     np.random.shuffle(randomized_dataset)
new_input=[]
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print ("Initialized")
    for step in range(num_steps):
        offset = (step * batch_size) % (randomized_dataset.shape[0] - batch_size)
        batch_data = randomized_dataset[offset:(offset + batch_size), :]
        noisy_batch_data = batch_data.copy()
        for i, point in enumerate(noisy_batch_data):
            noisy_batch_data[i, :] = point + np.random.normal(0, 1, noisy_batch_data.shape[1])
        feed_dict = {input_data: noisy_batch_data, clean_data: batch_data}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step", step, ":", l)
#             print "SKLEARN loss", np.sqrt(mean_squared_error(predictions, batch_data))
    new_input=layer_1.eval(feed_dict = {input_data :combined_dataset})
    test_targets_1hot = np.array([(x == np.arange(2)).astype(np.float32) for x in test_targets])
#     print(test_targets_1hot[:5])
    for step in range(370):
        c = np.random.choice(train_targets.shape[0], 2000, replace=False)
        batch_data = train_dataset[:,:-1][c]
        batch_targets = train_targets[c]
        batch_targets=np.array([(x == np.arange(2)).astype(np.float32) for x in batch_targets])
        feed_dict = {clean_data:batch_data, target:batch_targets}
        l, _ = session.run([loss_, optimize_], feed_dict=feed_dict)
        if (step % 10 == 0):
            print("Minibatch logloss at step", step, ":", l)
    new_input = layer_1_f.eval(feed_dict={clean_data:combined_dataset})
    print(session.run(a_, feed_dict={clean_data:combined_dataset[len(train_targets):],target:test_targets_1hot}))

Initialized
('Minibatch loss at step', 0, ':', 222.77254)
('Minibatch loss at step', 500, ':', 89.292938)
('Minibatch loss at step', 1000, ':', 39.540432)
('Minibatch loss at step', 1500, ':', 16.709871)
('Minibatch loss at step', 2000, ':', 26.412645)
('Minibatch loss at step', 2500, ':', 3.530025)
('Minibatch loss at step', 3000, ':', 1.5838422)
('Minibatch loss at step', 3500, ':', 0.6859898)
('Minibatch loss at step', 4000, ':', 0.40181309)
('Minibatch loss at step', 4500, ':', 0.38369343)
('Minibatch loss at step', 5000, ':', 0.31740069)
('Minibatch loss at step', 5500, ':', 0.33525366)
('Minibatch loss at step', 6000, ':', 0.36136743)
('Minibatch loss at step', 6500, ':', 0.29976884)
('Minibatch logloss at step', 0, ':', 2.0832458)
('Minibatch logloss at step', 10, ':', 0.43901727)
('Minibatch logloss at step', 20, ':', 0.27650648)
('Minibatch logloss at step', 30, ':', 0.19391036)
('Minibatch logloss at step', 40, ':', 0.22965193)
('Minibatch logloss at step', 50, ':', 0.1282324

In [36]:
#saving new train & test data 
from collections import Counter
print(train_dataset.shape)
new_train_data=new_input[:train_dataset.shape[0]]
new_train_data=np.c_[new_train_data,train_targets]
new_test_data=new_input[train_dataset.shape[0]:]
new_test_data=np.c_[new_test_data, test_targets]
print(new_train_data.shape, new_test_data.shape)
# np.savez('out/new_input.npz',train=new_train_data, test=new_test_data)

(125973, 123)
((125973, 201), (22544, 201))


In [37]:
model = LogisticRegression(C=1,class_weight='balanced',n_jobs=-1)

In [37]:
print (cross_val_score(model, new_train_data[:, :-1], train_targets, cv=3, scoring='precision_macro').mean())

0.731867870076


In [45]:
new_train_data = np.load('out/new_input.npz')['train']
new_test_data = np.load('out/new_input.npz')['test']

In [38]:
model.fit(new_train_data[:, :-1], train_targets)

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [39]:
preds = model.predict(new_test_data[:, :-1])
# np.savez('out/preds.npz',preds=preds)

In [40]:
print(accuracy_score(preds, test_targets))

0.740684882896


In [41]:
print(confusion_matrix(test_targets, preds))

[[7736 5097]
 [ 749 8962]]
