In [5]:
%pylab inline

import os
import numpy as np
import pandas as pd
from scipy.misc import imread
from sklearn.metrics import accuracy_score
import tensorflow as tf

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Setup the data

In [108]:
seed = 128
rng = np.random.RandomState(seed)

In [109]:
root_dir = os.path.abspath('..')
data_dir = os.path.join(root_dir, 'data')

# check for existence
os.path.exists(root_dir)
os.path.exists(data_dir)

True

In [110]:
train = pd.read_csv(os.path.join(data_dir, 'train.csv'), dtype=np.float32)
test = pd.read_csv(os.path.join(data_dir, 'test.csv'), header=None, dtype=np.float32)

In [111]:
train_x = train.iloc[:, :-1].values
test_x = test.iloc[:, :].values

In [112]:
split_size = int(train_x.shape[0]*0.7)

train_x, val_x = train_x[:split_size], train_x[split_size:]
train_y, val_y = train.popularity.values[:split_size], train.popularity.values[split_size:]

# Auxilary functions

In [113]:
def dense_to_one_hot(labels_dense, num_classes=4):
    """Convert class labels from scalars to one-hot vectors"""
    s = pd.Series(list(range(1,num_classes+1,1)))
    labels_one_hot = pd.get_dummies(labels_dense)
    
#     num_labels = labels_dense.shape[0]
#     index_offset = np.arange(num_labels) * num_classes
#     labels_one_hot = np.zeros((num_labels, num_classes))
#     labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    
    return labels_one_hot

def preproc(unclean_batch_x):
    """Convert values to range 0-1, with mean centered"""
    temp_batch = (unclean_batch_x-unclean_batch_x.min()) / (unclean_batch_x.max()-unclean_batch_x.min())
    
    return temp_batch

def batch_creator(batch_size, dataset_length, dataset_name):
    """Create batch with random samples and return appropriate format"""
    batch_mask = rng.choice(dataset_length, batch_size)
    
    batch_x = eval(dataset_name + '_x')[[batch_mask]].reshape(-1, input_num_units)
    batch_x = preproc(batch_x)
    
    if dataset_name == 'train':
        batch_y = eval(dataset_name).ix[batch_mask, 'popularity'].values
        batch_y = dense_to_one_hot(batch_y)
        
    return batch_x, batch_y

# Neural Net definition

In [114]:
### set all variables

# number of neurons in each layer
input_num_units = 6
hidden_num_units = 650
output_num_units = 4

# define placeholders
x = tf.placeholder(tf.float32, [None, input_num_units])
y = tf.placeholder(tf.float32, [None, output_num_units])

# set remaining variables
epochs = 498
batch_size = 512
learning_rate = 0.005

### define weights and biases of the neural network (refer this article if you don't understand the terminologies)

weights = {
    'hidden': tf.Variable(tf.random_normal([input_num_units, hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([hidden_num_units, output_num_units], seed=seed))
}

biases = {
    'hidden': tf.Variable(tf.random_normal([hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([output_num_units], seed=seed))
}

### Computational Graph

In [115]:
hidden_layer = tf.add(tf.matmul(x, weights['hidden']), biases['hidden'])
hidden_layer = tf.nn.relu(hidden_layer)

output_layer = tf.matmul(hidden_layer, weights['output']) + biases['output']


### Cost function and Optimizer

In [116]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=output_layer))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [117]:
init = tf.global_variables_initializer()

In [118]:
with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(epochs):
        avg_cost = 0
        total_batch = int(train.shape[0]/batch_size)
        for i in range(total_batch):
            batch_x, batch_y = batch_creator(batch_size, train_x.shape[0], 'train')
            _, c = sess.run([optimizer, cost], feed_dict = {x: batch_x, y: batch_y})
            
            avg_cost += c / total_batch
            
        print( "Epoch:", (epoch+1), "cost =", "{:.5f}".format(avg_cost) )
    
    print( "\nTraining complete!" )
    
    
    # find predictions on val set
    pred_temp = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(pred_temp, "float"))
    print( "Validation Accuracy:", accuracy.eval({x: val_x.reshape(-1, input_num_units), y: dense_to_one_hot(val_y)}) )
    
    predict = tf.argmax(output_layer, 1)
    pred = predict.eval({x: test_x.reshape(-1, input_num_units)})

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Epoch: 1 cost = 3.88131
Epoch: 2 cost = 2.88053
Epoch: 3 cost = 2.82461
Epoch: 4 cost = 2.58580
Epoch: 5 cost = 2.33191
Epoch: 6 cost = 2.44481
Epoch: 7 cost = 2.08236
Epoch: 8 cost = 1.96777
Epoch: 9 cost = 1.74780
Epoch: 10 cost = 1.82522
Epoch: 11 cost = 1.74649
Epoch: 12 cost = 1.80884
Epoch: 13 cost = 1.59858
Epoch: 14 cost = 1.46942
Epoch: 15 cost = 1.41847
Epoch: 16 cost = 1.34339
Epoch: 17 cost = 1.19429
Epoch: 18 cost = 1.22432
Epoch: 19 cost = 1.17421
Epoch: 20 cost = 1.10748
Epoch: 21 cost = 1.00562
Epoch: 22 cost = 0.93378
Epoch: 23 cost = 1.09359
Epoch: 24 cost = 1.01978
Epoch: 25 cost = 0.95891
Epoch: 26 cost = 0.89217
Epoch: 27 cost = 0.90632
Epoch: 28 cost = 0.89167
Epoch: 29 cost = 0.85212
Epoch: 30 cost = 0.89943
Epoch: 31 cost = 0.81073
Epoch: 32 cost = 0.80039
Epoch: 33 cost = 0.82612
Epoch: 34 cost = 0.76147
Epoch: 35 cost = 0.75358
Epoch: 36 cost = 0.73932
Epoch: 37 cost = 0.71289
Epoch: 38 cost = 0.78354
Epoch: 39 cost = 0.71961
Epoch: 40 cost = 0.76190
Epoch: 41

Epoch: 321 cost = 0.29029
Epoch: 322 cost = 0.25700
Epoch: 323 cost = 0.27404
Epoch: 324 cost = 0.26867
Epoch: 325 cost = 0.25165
Epoch: 326 cost = 0.23586
Epoch: 327 cost = 0.25198
Epoch: 328 cost = 0.23353
Epoch: 329 cost = 0.24476
Epoch: 330 cost = 0.24982
Epoch: 331 cost = 0.23253
Epoch: 332 cost = 0.27382
Epoch: 333 cost = 0.23291
Epoch: 334 cost = 0.23271
Epoch: 335 cost = 0.26035
Epoch: 336 cost = 0.24028
Epoch: 337 cost = 0.24947
Epoch: 338 cost = 0.23193
Epoch: 339 cost = 0.23930
Epoch: 340 cost = 0.24416
Epoch: 341 cost = 0.25305
Epoch: 342 cost = 0.22932
Epoch: 343 cost = 0.23547
Epoch: 344 cost = 0.26015
Epoch: 345 cost = 0.27109
Epoch: 346 cost = 0.23074
Epoch: 347 cost = 0.24350
Epoch: 348 cost = 0.24099
Epoch: 349 cost = 0.26988
Epoch: 350 cost = 0.23341
Epoch: 351 cost = 0.22866
Epoch: 352 cost = 0.26907
Epoch: 353 cost = 0.22411
Epoch: 354 cost = 0.22578
Epoch: 355 cost = 0.22651
Epoch: 356 cost = 0.22035
Epoch: 357 cost = 0.24969
Epoch: 358 cost = 0.26522
Epoch: 359 c

In [119]:
np.savetxt("out.txt", pred, fmt="%d",)