In [1]:
import numpy as np
import tensorflow as tf
from sklearn import preprocessing

In [2]:
raw_data_all = np.loadtxt('Audiobooks_data.csv', delimiter = ',')
unscaled_input_all = raw_data_all[:, 1: -1]
target_all = raw_data_all[:, -1]

In [3]:
num_one_data = int(np.sum(target_all))
zero_target_counter = 0
indicase_to_remove = []

for i in range (target_all.shape[0]):
    if target_all[i] == 0:
        zero_target_counter += 1
        if zero_target_counter >= num_one_data:
            indicase_to_remove.append(i)
            
unscaled_input_all_balance = np.delete(unscaled_input_all, indicase_to_remove, axis = 0)
target_all_balance = np.delete(target_all, indicase_to_remove, axis = 0)

In [4]:
scale_input = preprocessing.scale(unscaled_input_all_balance)

In [5]:
shuffled_indicase = np.arange(unscaled_input_all_balance.shape[0])
np.random.shuffle(shuffled_indicase)

shuffled_input = unscaled_input_all_balance[shuffled_indicase]
shuffled_target = target_all_balance[shuffled_indicase]

In [6]:
from sklearn.model_selection import train_test_split

samples_count = shuffled_input.shape[0]
train_sample_count = int(0.8 * samples_count)
validation_sample_count = int(0.1 * samples_count)
test_sample_count = samples_count - train_sample_count - validation_sample_count

#train_size = 0.8
input_train, input_rem, target_train, target_rem = train_test_split(shuffled_input, shuffled_target, train_size = 0.8)

#test_size = 0.5
input_validation, input_test, target_validation, target_test = train_test_split(input_rem, target_rem, test_size = 0.5)

print(np.sum(target_train), train_sample_count, np.sum(target_train/train_sample_count))
print(np.sum(target_validation), validation_sample_count, np.sum(target_validation/validation_sample_count))
print(np.sum(target_test), test_sample_count, np.sum(target_test/test_sample_count))

1801.0 3578 0.5033538289547235
215.0 447 0.4809843400447426
221.0 448 0.4933035714285713


In [7]:
np.savez('Audiobooks_data_train', inputs = input_train, targets = target_train)
np.savez('Audiobooks_data_validation', inputs = input_validation, targets = target_validation)
np.savez('Audiobooks_data_test', inputs = input_test, targets = target_test)

In [8]:
npz = np.load('Audiobooks_data_train.npz')
train_input, train_target = npz['inputs'].astype(np.float_), npz['targets'].astype(np.int_)

npz = np.load('Audiobooks_data_validation.npz')
validation_input, validation_target = npz['inputs'].astype(np.float_), npz['targets'].astype(np.int_)

npz = np.load('Audiobooks_data_test.npz')
test_input, test_target = npz['inputs'].astype(np.float_), npz['targets'].astype(np.int_)

In [9]:
# Create a class that will do the batching for the algorithm
# This code is extremely reusable. You should just change Audiobooks_data everywhere in the code
class Audiobooks_Data_Reader():
    # Dataset is a mandatory arugment, while the batch_size is optional
    # If you don't input batch_size, it will automatically take the value: None
    def __init__(self, dataset, batch_size = None):
    
        # The dataset that loads is one of "train", "validation", "test".
        # e.g. if I call this class with x('train',5), it will load 'Audiobooks_data_train.npz' with a batch size of 5.
        npz = np.load('Audiobooks_data_{0}.npz'.format(dataset))
        
        # Two variables that take the values of the inputs and the targets. Inputs are floats, targets are integers
        self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
        
        # Counts the batch number, given the size you feed it later
        # If the batch size is None, we are either validating or testing, so we want to take the data in a single batch
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        self.batch_count = self.inputs.shape[0] // self.batch_size
    
    # A method which loads the next batch
    def __next__(self):
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()
            
        # You slice the dataset in batches and then the "next" function loads them one after the other
        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1
        
        # One-hot encode the targets. In this example it's a bit superfluous since we have a 0/1 column 
        # as a target already but we're giving you the code regardless, as it will be useful for any 
        # classification task with more than one target column
        classes_num = 2
        targets_one_hot = np.zeros((targets_batch.shape[0], classes_num))
        targets_one_hot[range(targets_batch.shape[0]), targets_batch] = 1
        
        # The function will return the inputs batch and the one-hot encoded targets
        return inputs_batch, targets_one_hot
    
        
    # A method needed for iterating over the batches, as we will put them in a loop
    # This tells Python that the class we're defining is iterable, i.e. that we can use it like:
    # for input, output in data: 
        # do things
    # An iterator in Python is a class with a method __next__ that defines exactly how to iterate through its objects
    def __iter__(self):
        return self

In [12]:
input_size = 10
output_size = 2
hidden_layer_size = 50

tf.compat.v1.reset_default_graph()
tf.compat.v1.disable_eager_execution()

inputs = tf.compat.v1.placeholder(tf.float32, shape = (None, input_size))
targets = tf.compat.v1.placeholder(tf.int32, shape = (None, output_size))

weights_1 = tf.compat.v1.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.compat.v1.get_variable("biases_1", [hidden_layer_size])
outputs_1 = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

weights_2 = tf.compat.v1.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.compat.v1.get_variable("biases_2", [hidden_layer_size])
outputs_2 = tf.nn.relu(tf.matmul(outputs_1, weights_2) + biases_2)

weights_3 = tf.compat.v1.get_variable("weights_3", [hidden_layer_size, output_size])
biases_3 = tf.compat.v1.get_variable("biases_3", [output_size])
outputs = tf.matmul(outputs_2, weights_3) + biases_3

loss = tf.nn.softmax_cross_entropy_with_logits(logits = outputs, labels = targets)
mean_loss = tf.reduce_mean(loss)

optimize = tf.compat.v1.train.AdamOptimizer(learning_rate = 0.001).minimize(mean_loss)
out_equals_target = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))

#tf.compat.v1.disable_eager_execution()

sess = tf.compat.v1.InteractiveSession()
initializer = tf.compat.v1.global_variables_initializer()
sess.run(initializer)

batch_size = 100
max_epochs = 100
prev_validation_loss = 9999999.

train_data = Audiobooks_Data_Reader('train', batch_size)
validation_data = Audiobooks_Data_Reader('validation')

for epoch_counter in range(max_epochs):
    
    curr_epoch_loss = 0.
    
    for input_batch, target_batch in train_data:
        _, batch_loss = sess.run([optimize, mean_loss],
            feed_dict = {inputs: input_batch, targets: target_batch})
        
        curr_epoch_loss += batch_loss
        
    curr_epoch_loss /= train_data.batch_count
    
    validation_loss = 0.
    validation_accuracy = 0.
    
    for input_batch, target_batch in validation_data:
        validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
        feed_dict = {inputs: input_batch, targets: target_batch})
        
    print('Epoch' + str(epoch_counter + 1) + 
         ', Training Loss: ' + '{0:.3f}'.format(curr_epoch_loss) +
         ', Validation Loss: ' + '{0:.3f}'.format(validation_loss) +
         ', Validation Accuracy: ' + '{0:.3f}'.format(validation_accuracy * 100) + '%')
    
    if validation_loss > prev_validation_loss:
        break
    
    prev_validation_loss = validation_loss
    
print('End of Training') 

Epoch1, Training Loss: 55.273, Validation Loss: 4.202, Validation Accuracy: 69.351%
Epoch2, Training Loss: 2.873, Validation Loss: 1.528, Validation Accuracy: 65.772%
Epoch3, Training Loss: 0.937, Validation Loss: 0.427, Validation Accuracy: 79.642%
Epoch4, Training Loss: 0.492, Validation Loss: 0.408, Validation Accuracy: 78.747%
Epoch5, Training Loss: 0.505, Validation Loss: 0.475, Validation Accuracy: 75.392%
End of Training


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)


In [13]:
test_data = Audiobooks_Data_Reader('test')

for input_batch, target_batch in test_data:
    test_accuracy = sess.run([accuracy],
    feed_dict = {inputs: input_batch, targets: target_batch})

test_accuracy_percent = test_accuracy[0] * 100.
print('Test Accuracy: ' + '{:.2f}'.format(test_accuracy_percent) + '%')

Test Accuracy: 74.33%


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
