## Create the machine learning algorithm



### Import the relevant libraries

In [1]:
import numpy as np
import tensorflow as tf

### Data

In [2]:
# we load the train data in the temporary variable

npz = np.load('Audiobooks_data_train.npz')

# we extract the inputs and targets using the keyword under which we saved them
train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)

# we load the validation data inputs and targets in the temporary variable
npz = np.load('Audiobooks_data_validation.npz')
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

# we load the test data inputs and targets in the temporary variable
npz = np.load('Audiobooks_data_test.npz')
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

### Model
Outline, optimizers, loss, early stopping and training

In [3]:
input_size = 10
output_size = 2
hidden_layer_size = 70
    
model = tf.keras.Sequential([
    
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax') 
])



model.compile(optimizer='adamax', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


batch_size = 200

max_epochs = 100

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)


model.fit(train_inputs, 
          train_targets, 
          batch_size=batch_size,
          epochs=max_epochs, 
          callbacks=[early_stopping], 
          validation_data=(validation_inputs, validation_targets), 
          verbose = 2 
          )  

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 1s - loss: 0.5952 - accuracy: 0.8181 - val_loss: 0.5164 - val_accuracy: 0.8345
Epoch 2/100
3579/3579 - 0s - loss: 0.4180 - accuracy: 0.8793 - val_loss: 0.3778 - val_accuracy: 0.8523
Epoch 3/100
3579/3579 - 0s - loss: 0.3272 - accuracy: 0.8843 - val_loss: 0.3497 - val_accuracy: 0.8591
Epoch 4/100
3579/3579 - 0s - loss: 0.3038 - accuracy: 0.8863 - val_loss: 0.3399 - val_accuracy: 0.8658
Epoch 5/100
3579/3579 - 0s - loss: 0.2891 - accuracy: 0.8919 - val_loss: 0.3248 - val_accuracy: 0.8702
Epoch 6/100
3579/3579 - 0s - loss: 0.2787 - accuracy: 0.8941 - val_loss: 0.3226 - val_accuracy: 0.8702
Epoch 7/100
3579/3579 - 0s - loss: 0.2710 - accuracy: 0.8963 - val_loss: 0.3170 - val_accuracy: 0.8747
Epoch 8/100
3579/3579 - 0s - loss: 0.2669 - accuracy: 0.8989 - val_loss: 0.3177 - val_accuracy: 0.8770
Epoch 9/100
3579/3579 - 0s - loss: 0.2621 - accuracy: 0.8989 - val_loss: 0.3128 - val_accuracy: 0.8770
Epoch 10/100
3579/3579 - 0

<tensorflow.python.keras.callbacks.History at 0x26d40745fc8>

## Test the model

As we discussed in the lectures, after training on the training data and validating on the validation data, we test the final prediction power of our model by running it on the test dataset that the algorithm has NEVER seen before.

In [4]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [5]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.26. Test accuracy: 90.18%


In [6]:
pred = model.predict_classes(test_inputs)

In [7]:
import pandas as pd

In [8]:
output = pd.DataFrame(pred,columns=['Predicted'])


In [9]:
output['Actual'] = test_targets

In [10]:
confusion_matrix = pd.crosstab(output['Actual'], output['Predicted'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,201,24
1,20,203


### Accuracy
- It represents percentage of prediction our model got right


In [11]:
accuracy = (confusion_matrix.loc[1,1]+confusion_matrix.loc[0,0])/(confusion_matrix.loc[1,1]+confusion_matrix.loc[0,1] + confusion_matrix.loc[1,0] + confusion_matrix.loc[0,0])
print('\n Accuracy: {0:.2f}%'.format(accuracy*100))


 Accuracy: 90.18%


### Precision
- It represents out of total predicted returning users how many percent of users actually returned

In [12]:
precision = confusion_matrix.loc[1,1]/(confusion_matrix.loc[1,1]+confusion_matrix.loc[0,1])
print('\n Precision: {0:.2f}%'.format(precision*100))


 Precision: 89.43%


### Recall
- It represents out of total returning users how many we predicted. It is also names as hit ratio. 

In [13]:
recall = confusion_matrix.loc[1,1]/(confusion_matrix.loc[1,1]+confusion_matrix.loc[1,0])
print('\n Recall: {0:.2f}%'.format(precision*100))


 Recall: 89.43%
