In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
raw_data = pd.read_csv('./../data/Audiobooks_data_with_column_headers.csv')

In [3]:
raw_data.head()

Unnamed: 0,ID,Book length (mins)_overall,Book length (mins)_avg,Price_overall,Price_avg,Review,Review 10/10,Minutes listened,Completion,Support requests,Last visited minus purchase date,Targets
0,994,1620.0,1620,19.73,19.73,1,10.0,0.99,1603.8,5,92,0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,0,0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,388,0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0


In [4]:
raw_data.shape

(14084, 12)

In [5]:
targets_zeros = raw_data[raw_data['Targets'] == 0]
targets_zeros.shape

(11847, 12)

In [6]:
targets_ones = raw_data[raw_data['Targets'] == 1]
targets_ones.shape

(2237, 12)

In [7]:
min_count = min(len(targets_zeros), len(targets_ones))
min_count

2237

In [8]:
zeros_sample = raw_data[raw_data['Targets'] == 0].sample(n=min_count, random_state=42)
ones_sample = raw_data[raw_data['Targets'] == 1].sample(n=min_count, random_state=42)

In [9]:
balanced_data = pd.concat([zeros_sample, ones_sample]).sample(frac=1, random_state=42)

In [10]:
balanced_data.reset_index(drop=True, inplace=True)

In [11]:
balanced_data.shape

(4474, 12)

In [12]:
inputs = balanced_data.iloc[:, 1:-1]
targets = balanced_data.iloc[:, -1]

In [13]:
inputs.shape

(4474, 10)

In [14]:
targets.shape

(4474,)

In [15]:
scaled_inputs = preprocessing.scale(inputs)

In [16]:
train_inputs, temp_inputs = train_test_split(scaled_inputs, test_size=0.2, random_state=42)
validation_inputs, test_inputs = train_test_split(temp_inputs, test_size=0.5, random_state=42)
train_inputs.shape, validation_inputs.shape, test_inputs.shape

((3579, 10), (447, 10), (448, 10))

In [17]:
train_targets, temp_targets = train_test_split(targets, test_size=0.2, random_state=42)
validation_targets, test_targets = train_test_split(temp_targets, test_size=0.5, random_state=42)
train_targets.shape, validation_targets.shape, test_targets.shape

((3579,), (447,), (448,))

In [18]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

In [19]:
npz = np.load('Audiobooks_data_train.npz')
train_inputs = npz['inputs'].astype(np.float64)
train_targets = npz['targets'].astype(np.int64)

In [20]:
npz = np.load('Audiobooks_data_validation.npz')
validation_inputs = npz['inputs'].astype(np.float64)
validation_targets = npz['targets'].astype(np.int64)

In [21]:
npz = np.load('Audiobooks_data_test.npz')
test_inputs = npz['inputs'].astype(np.float64)
test_targets = npz['targets'].astype(np.int64)

In [22]:
input_size = 10
hidden_layer_size = 50
output_size = 2
early_stopping = tf.keras.callbacks.EarlyStopping()

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
])

In [23]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [24]:
model.fit(
    train_inputs,
    train_targets,
    batch_size=100,
    epochs=100,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=1
)

Epoch 1/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5920 - loss: 0.6518 - val_accuracy: 0.7293 - val_loss: 0.5304
Epoch 2/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7730 - loss: 0.4993 - val_accuracy: 0.7740 - val_loss: 0.4607
Epoch 3/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7807 - loss: 0.4289 - val_accuracy: 0.7852 - val_loss: 0.4289
Epoch 4/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8099 - loss: 0.3963 - val_accuracy: 0.7740 - val_loss: 0.4147
Epoch 5/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8011 - loss: 0.3819 - val_accuracy: 0.7808 - val_loss: 0.4040
Epoch 6/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8077 - loss: 0.3733 - val_accuracy: 0.7875 - val_loss: 0.3957
Epoch 7/100
[1m36/36[0m [32m━━

<keras.src.callbacks.history.History at 0x264c59d2e10>

In [25]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7940 - loss: 0.4025 


In [26]:
print('Test loss: {0:.2f} Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))

Test loss: 0.40 Test accuracy: 80.13%
