# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 70px; font-weight: bold;">Audiobooks Analysis</span>

In [None]:
import numpy as np
from sklearn import preprocessing

In [None]:
# Parse the comma-separated dataset into one 2-D NumPy array.
dataset_path = '/content/Audiobooks_data.csv'
raw = np.loadtxt(dataset_path, delimiter=',')

# The last column is used as the target; the columns between the first and
# the last are the features. The first column is skipped
# (presumably a row/customer ID -- TODO confirm against the dataset description).
raw_targets = raw[:, -1]
raw_features = raw[:, 1:-1]

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 60px; font-weight: bold;">Data Preparation</span>

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Balance the data</span>

In [None]:
# Balance the two classes by undersampling the 0 class: keep as many 0-target
# rows as there are 1-target rows and drop the surplus 0-target rows.

# Number of 1-target rows. Summing only equals the count if 'raw_targets'
# holds nothing but 0s and 1s -- assumed here, TODO confirm.
num_one_targets = int(np.sum(raw_targets))

# Positions of every 0-target row, in their original order.
zero_indices = np.flatnonzero(raw_targets == 0)

# Total number of 0-target rows (same final value the loop counter used to hold).
zero_targets_counter = int(zero_indices.size)

# The first 'num_one_targets' zeros are kept; every zero after that is surplus.
# If there are no surplus zeros the slice is empty and nothing is removed.
indices_to_remove = zero_indices[num_one_targets:].tolist()

# Drop only the surplus 0-target rows (not all of them) from features and
# targets, using the same indices so the two arrays stay aligned row-for-row.
unscaled_inputs_equal_priors = np.delete(raw_features, indices_to_remove, axis=0)
targets_equal_priors = np.delete(raw_targets, indices_to_remove, axis=0)

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Standardize inputs</span>

In [None]:
# Standardize the balanced feature matrix with scikit-learn's 'preprocessing.scale'
# (with default arguments this should centre and scale each column -- see the sklearn docs).
# NOTE(review): the scaling statistics are computed over ALL rows, before the
# train/validation/test split further down, so validation and test rows
# influence the scaling applied to the training data. Consider fitting a
# scaler on the training split only and applying it to the other two.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Shuffle the data</span>

In [None]:
# Shuffle rows with a fixed seed so that the split membership (and every number
# reported downstream) is identical on each Restart & Run All.
shuffle_seed = 42
shuffle_rng = np.random.default_rng(shuffle_seed)

# A random permutation of the row indices 0 .. n_rows-1.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same permutation to inputs and targets so each row keeps its label.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Train / Validation / Test Split</span>

In [None]:
# Total number of rows available after shuffling.
samples_count = shuffled_inputs.shape[0]

# 80% / 10% split sizes (truncated to whole rows); the test set takes whatever
# is left, so the three sizes always add up to 'samples_count'.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row offsets at which the training part and the validation part end.
train_end = train_samples_count
validation_end = train_samples_count + validation_samples_count

# Slice inputs and targets at the same offsets so rows and labels stay paired.
train_inputs, train_targets = shuffled_inputs[:train_end], shuffled_targets[:train_end]
validation_inputs, validation_targets = (
    shuffled_inputs[train_end:validation_end],
    shuffled_targets[train_end:validation_end],
)
test_inputs, test_targets = shuffled_inputs[validation_end:], shuffled_targets[validation_end:]

# For each split print: sum of targets, number of rows, and their ratio
# (a ratio close to 0.5 means the split is roughly class-balanced).
split_summaries = (
    ("Training set - Sum of targets, Number of samples, Mean target value per sample:",
     train_targets, train_samples_count),
    ("Validation set - Sum of targets, Number of samples, Mean target value per sample:",
     validation_targets, validation_samples_count),
    ("Test set - Sum of targets, Number of samples, Mean target value per sample:",
     test_targets, test_samples_count),
)
for summary_header, split_targets, split_size in split_summaries:
    positives = np.sum(split_targets)
    print(summary_header)
    print(positives, split_size, positives / split_size)

Training set - Sum of targets, Number of samples, Mean target value per sample:
1792.0 3579 0.5006985191394244
Validation set - Sum of targets, Number of samples, Mean target value per sample:
215.0 447 0.4809843400447427
Test set - Sum of targets, Number of samples, Mean target value per sample:
230.0 448 0.5133928571428571


In [None]:
# Persist each split as its own .npz archive (np.savez appends the '.npz'
# extension), storing the arrays under the keys 'inputs' and 'targets'.
splits_to_save = (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
)
for npz_name, inputs_to_save, targets_to_save in splits_to_save:
    np.savez(npz_name, inputs=inputs_to_save, targets=targets_to_save)

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 60px; font-weight: bold;">Modeling</span>

In [None]:
import tensorflow as tf

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Load data</span>

In [None]:
def _load_split(npz_path):
    """Load one saved split from an .npz archive.

    Parameters
    ----------
    npz_path : str
        Path of an archive containing the arrays 'inputs' and 'targets'.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The 'inputs' array cast to float and the 'targets' array cast to int.
    """
    # np.load keeps the archive's file handle open until it is closed; the
    # 'with' block closes it as soon as both arrays have been read and copied.
    with np.load(npz_path) as archive:
        return archive['inputs'].astype(float), archive['targets'].astype(int)


# Read back the three splits written by the data-preparation section.
train_inputs, train_targets = _load_split('Audiobooks_data_train.npz')
validation_inputs, validation_targets = _load_split('Audiobooks_data_validation.npz')
test_inputs, test_targets = _load_split('Audiobooks_data_test.npz')

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Train & Validation</span>

In [None]:
# Layer sizes. 'input_size' is not passed to any layer below (the first Dense
# layer picks up its input width from the data it is first called on); it is
# kept as a record of the expected number of features.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling start from the same state on every run.
model_seed = 42
tf.keras.utils.set_random_seed(model_seed)

# Two ReLU hidden layers followed by a softmax over the 'output_size' classes.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),  # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),  # 2nd hidden layer
    tf.keras.layers.Dense(output_size, activation='softmax'),  # class probabilities
])

# Sparse categorical cross-entropy because the targets are integer class ids
# (0 / 1), not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Mini-batch size and the upper bound on the number of epochs.
batch_size = 100
max_epochs = 100

# Stop once the validation loss has not improved for 'early_stopping_patience'
# consecutive epochs, and roll back to the best weights seen. This makes
# 'max_epochs' a true maximum instead of always training for the full 100
# epochs and over-fitting the training set.
early_stopping_patience = 5
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=early_stopping_patience,
    restore_best_weights=True,
)

# Train on the training split and monitor the validation split after each
# epoch. The History object is kept in 'history' (per-epoch metrics live in
# history.history) rather than being echoed as the cell's output.
history = model.fit(
    train_inputs,  # Training inputs
    train_targets,  # Training targets
    batch_size=batch_size,  # Batch size
    epochs=max_epochs,  # Maximum number of training epochs
    callbacks=[early_stopping],  # Early stopping on validation loss
    validation_data=(validation_inputs, validation_targets),  # Validation data
    verbose=2  # One log line per epoch
)

Epoch 1/100
36/36 - 1s - loss: 0.6047 - accuracy: 0.6725 - val_loss: 0.5287 - val_accuracy: 0.7159 - 599ms/epoch - 17ms/step
Epoch 2/100
36/36 - 0s - loss: 0.4782 - accuracy: 0.7645 - val_loss: 0.4514 - val_accuracy: 0.7494 - 60ms/epoch - 2ms/step
Epoch 3/100
36/36 - 0s - loss: 0.4167 - accuracy: 0.7896 - val_loss: 0.4136 - val_accuracy: 0.7830 - 59ms/epoch - 2ms/step
Epoch 4/100
36/36 - 0s - loss: 0.3892 - accuracy: 0.7924 - val_loss: 0.3984 - val_accuracy: 0.7785 - 77ms/epoch - 2ms/step
Epoch 5/100
36/36 - 0s - loss: 0.3735 - accuracy: 0.8030 - val_loss: 0.3871 - val_accuracy: 0.7763 - 61ms/epoch - 2ms/step
Epoch 6/100
36/36 - 0s - loss: 0.3629 - accuracy: 0.8047 - val_loss: 0.3815 - val_accuracy: 0.7875 - 60ms/epoch - 2ms/step
Epoch 7/100
36/36 - 0s - loss: 0.3579 - accuracy: 0.8041 - val_loss: 0.3867 - val_accuracy: 0.7808 - 74ms/epoch - 2ms/step
Epoch 8/100
36/36 - 0s - loss: 0.3550 - accuracy: 0.8092 - val_loss: 0.3705 - val_accuracy: 0.7897 - 56ms/epoch - 2ms/step
Epoch 9/100
36

<keras.src.callbacks.History at 0x7ad9424e5360>

# <span style="color:#4040a1; font-family: Trebuchet MS; font-size: 40px; font-weight: bold;">Test</span>

In [None]:
# Evaluate the trained model once on the held-out test split.
# 'evaluate' returns the loss followed by the compiled metrics; the model was
# compiled with metrics=['accuracy'], hence the (loss, accuracy) unpacking.
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [None]:
# Report the test loss and the test accuracy as a percentage, both to two decimals.
test_report_template = '\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'
test_accuracy_percent = test_accuracy*100.
print(test_report_template.format(test_loss, test_accuracy_percent))


Test loss: 0.31. Test accuracy: 84.38%
