#### *Import the libraries*

In [1]:
import numpy as np
from sklearn import preprocessing

np.random.seed(42)# Set seed for reproducibility
#Setting np.random.seed(42) ensures that NumPy generates the same sequence of random numbers every time, making the results reproducible. 
#This is useful for debugging and consistency in experiments.

#the number inside np.random.seed(42) is like a password that makes sure you
#get the same random results every time you run the code.

#dataset_provider-'UDEMY'

raw_csv_data = np.loadtxt('BusinessCase_AudioBook.csv', delimiter=',') #delimiter=',' specifies that commas are used to separate the values in the CSV file
unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

In [2]:
# When the data was collected it was actually arranged by date
# Shuffle the indices of the data, so the data is not arranged in any way when we feed it.
# Since we will be batching, we want the data to be as randomly spread out as possible

#The primary reason why shuffling after balancing yields better results is that it ensures a uniform and unbiased mix of data points, 
#reducing the risk of overfitting and improving the generalization capabilities of your model.

#Synthetic Data Bias: If synthetic data is generated in clusters around the minority class examples, shuffling after balancing is necessary to mix these clusters with the rest of the data. 
#However, shuffling before balancing might not effectively mitigate any bias introduced during the balancing process.
shuffled_indices = np.arange(unscaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices)

# Use the shuffled indices to shuffle the inputs and targets.
unscaled_inputs_all = unscaled_inputs_all[shuffled_indices]
targets_all = targets_all[shuffled_indices]

#### Balacne the dataset

In [3]:
num_one_targets = int(np.sum(targets_all)) #total of '1' in the target column, no need to total '0' as adding 0 will result the same
zero_targets_counter = 0 #initializing the counter to add up total index with '0' later on
indices_to_remove = [] #initializing empty list 

for i in range(targets_all.shape[0]): #the loop goes till the last element in the target column which is given by .shape[0]
    if targets_all[i] == 0: #checking if the target value is '0'
        zero_targets_counter += 1 #if target value is '0' increase the counter
        if zero_targets_counter > num_one_targets: #if the number of '0' in the target column exceedd the total count of '1' (num_one_targets): 
            indices_to_remove.append(i) #we add the rows or the values in the target column to indices_to_remove to balance the dataset
            
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0) #we remove the rows with extra '0' value in our input dataset

targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0) #we remove the rows with extra '0' value in our target dataset

In [4]:
print(int(np.sum(targets_all)))
print(targets_all.shape)

2237
(14084,)


#### Standardize the inputs

In [5]:
#Feature Scaling
#The scale function standardizes the features by removing the mean and scaling to unit variance. 
#This means each feature will have a mean of approximately 0 and a standard deviation of 1 after scaling.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)
scaled_inputs

array([[ 0.11381403, -0.25009266,  0.63913456, ...,  1.33008883,
        -0.20200263, -0.7577214 ],
       [-0.64487159,  0.74020675, -0.23213717, ..., -0.43799792,
        -0.20200263,  2.80337588],
       [-0.75325525, -0.74524236, -0.15076555, ...,  1.33008883,
        -0.20200263, -0.7577214 ],
       ...,
       [-1.40355721,  0.86399418, -0.36312563, ..., -0.43799792,
        -0.20200263,  1.95910357],
       [ 1.19765064,  0.36884447, -0.36312563, ...,  0.04420756,
        -0.20200263,  0.79011116],
       [-1.94547551, -0.74524236, -0.36312563, ..., -0.43799792,
        -0.20200263,  0.33550299]])

#### Shuffle the data

In [6]:
#Shuffling the data ensures that there's no inherent order in the dataset that could bias the learning process.
shuffled_indices = np.arange(scaled_inputs.shape[0]) #np.arange(scaled_inputs.shape[0]) creates an array of indices from 0 to the number of rows in scaled_inputs
np.random.shuffle(shuffled_indices) #p.random.shuffle(shuffled_indices) shuffles the indices randomly, ensuring that they are in a random order

shuffled_inputs = scaled_inputs[shuffled_indices] #uses the shuffled indices to rearrange the rows of scaled_inputs, resulting in shuffled_inputs where the rows are shuffled randomly
shuffled_targets = targets_equal_priors[shuffled_indices] #shuffles the rows of targets_equal_priors according to the same shuffled indices, resulting in shuffled_targets

#### Split the dataset into train, validation, and test

In [7]:
samples_count = shuffled_inputs.shape[0] #count the number of samples/ .shape[0]counts the number of rows  

train_samples_count = int(0.8*samples_count) #train data, 80% of samples_count
validation_samples_count = int(0.1*samples_count) #validation data, 10% of samples_count
test_samples_count = samples_count - train_samples_count -validation_samples_count #test data, rest of the samples_count(10%)

train_inputs = shuffled_inputs[:train_samples_count] # [0 : 80%] of the data from shuffled_inputs
train_targets = shuffled_targets[:train_samples_count] # [0 : 80%] of the data from shuffled_targets

validation_inputs = shuffled_inputs[train_samples_count: train_samples_count+validation_samples_count] # [80% : 90%] of data form shuffled_inputs
validation_targets = shuffled_targets[train_samples_count: train_samples_count+validation_samples_count] # [80% : 90%] of data from shuffled_targets

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:] #[90% : the end] of data from shuffled_inputs
test_targets = shuffled_targets[train_samples_count+validation_samples_count:] #[90% : the end] of data from shuffled_targets

#Check if the splitted dataset is balanced, (check the total '1', total sample, proportion of '1' in the data set afetr splitting(should be close to 50%))
print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

1785.0 3579 0.49874266554903607
225.0 447 0.5033557046979866
227.0 448 0.5066964285714286


#### Save the 3 datasets in *.npz

In [8]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

#### Create the machine learning algorithm

In [9]:
import tensorflow as tf

#### Data

In [10]:
#npz is typically a dictionary-like object that results from loading an .npz file using np.load('filename.npz')
npz = np.load('Audiobooks_data_train.npz')
# We saved the NPZs in 2-tuple form [inputs, targets]
#.astype(float) converts the data type of the array to float. This ensures that all the values in the array are of floating-point type.
train_inputs = npz['inputs'].astype(float) # np.ndarray.astype() creates a copy of the array, cast to a specific type
train_targets = npz['targets'].astype(int)

npz = np.load('Audiobooks_data_validation.npz')
validation_inputs = npz['inputs'].astype(float)
validation_targets = npz['targets'].astype(int)

npz = np.load('Audiobooks_data_test.npz')
test_inputs = npz['inputs'].astype(float)
test_targets = npz['targets'].astype(int)

#### Model

In [11]:
input_size = 10
output_size = 2
hidden_layer_size = 50

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu', input_shape = (input_size,)),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(output_size, activation = 'softmax'), #we know our model is a classifier, therefore our output layer should be actviated with softmax 
])

model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics=['accuracy']) #sparse_categorical_crossentropy --> does one hot encoding to both train inputs and targets

batch_size = 100

max_epochs = 100

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2) #By default, this object will monitor the validation loss and stop the training process the first time the validation loss starts increasing.
# tf.keras.callbacks.EarlyStopping(patience=2) configures the early stopping mechanism of the algorithm, 'patience' lets us decide how many consecutive increase in loss we can tolerate, here '2'

model.fit(train_inputs,
         train_targets,
         batch_size = batch_size,
         epochs = max_epochs,
         callbacks = [early_stopping],
         validation_data = (validation_inputs, validation_targets),
         verbose=2)

Epoch 1/100
36/36 - 1s - loss: 0.6387 - accuracy: 0.6334 - val_loss: 0.5634 - val_accuracy: 0.7204 - 860ms/epoch - 24ms/step
Epoch 2/100
36/36 - 0s - loss: 0.5090 - accuracy: 0.7457 - val_loss: 0.4789 - val_accuracy: 0.7494 - 120ms/epoch - 3ms/step
Epoch 3/100
36/36 - 0s - loss: 0.4538 - accuracy: 0.7622 - val_loss: 0.4471 - val_accuracy: 0.7539 - 128ms/epoch - 4ms/step
Epoch 4/100
36/36 - 0s - loss: 0.4258 - accuracy: 0.7784 - val_loss: 0.4360 - val_accuracy: 0.7494 - 86ms/epoch - 2ms/step
Epoch 5/100
36/36 - 0s - loss: 0.4113 - accuracy: 0.7902 - val_loss: 0.4172 - val_accuracy: 0.7673 - 92ms/epoch - 3ms/step
Epoch 6/100
36/36 - 0s - loss: 0.3972 - accuracy: 0.7944 - val_loss: 0.4118 - val_accuracy: 0.7718 - 113ms/epoch - 3ms/step
Epoch 7/100
36/36 - 0s - loss: 0.3947 - accuracy: 0.7890 - val_loss: 0.3961 - val_accuracy: 0.7875 - 146ms/epoch - 4ms/step
Epoch 8/100
36/36 - 0s - loss: 0.3812 - accuracy: 0.8050 - val_loss: 0.4100 - val_accuracy: 0.7673 - 140ms/epoch - 4ms/step
Epoch 9/1

<keras.callbacks.History at 0x1eaa0fc3910>

#### Test

In [12]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [13]:
print(f'Test Loss: {round(test_loss, 2)}, Test Accuracy: {round(test_accuracy*100, 2)}%')

Test Loss: 0.35, Test Accuracy: 81.25%


In [14]:
#this line uses the trained model to make predictions on the test inputs. The model.predict() 
#method returns an array of probabilities for each class. 
#Since this is a binary classification problem, the output will be a 2D array where each row corresponds 
#to a sample and each column corresponds to the predicted probability for each class (0 or 1).
predictions = model.predict(train_inputs)

#This line extracts the probabilities of the positive class (class 1, which indicates the customer will buy again) from the predictions. 
#The [:, 1] indexing selects the second column (index 1) of the predictions array, 
#which contains the probabilities of class 1 for each sample.
conversion_probabilities = predictions[:, 1]

# Set a threshold for targeting customers
#This line sets a threshold value for classifying whether a customer is likely to buy again. 
#In this case, the threshold is set to 0.5, meaning if the predicted probability of buying again is greater than 0.5, 
#the customer will be classified as likely to buy again.
threshold = 0.5

# Determine which customers to target based on the threshold
target_customers = conversion_probabilities > threshold

# Print the results
for i, prob in enumerate(conversion_probabilities):
    print(f"Customer {i}: Probability of buying again = {prob:.2f}, Target = {'Yes' if prob > threshold else 'No'}")

Customer 0: Probability of buying again = 0.67, Target = Yes
Customer 1: Probability of buying again = 0.32, Target = No
Customer 2: Probability of buying again = 0.24, Target = No
Customer 3: Probability of buying again = 0.00, Target = No
Customer 4: Probability of buying again = 0.14, Target = No
Customer 5: Probability of buying again = 0.52, Target = Yes
Customer 6: Probability of buying again = 0.47, Target = No
Customer 7: Probability of buying again = 0.88, Target = Yes
Customer 8: Probability of buying again = 0.00, Target = No
Customer 9: Probability of buying again = 0.99, Target = Yes
Customer 10: Probability of buying again = 0.00, Target = No
Customer 11: Probability of buying again = 1.00, Target = Yes
Customer 12: Probability of buying again = 0.72, Target = Yes
Customer 13: Probability of buying again = 0.00, Target = No
Customer 14: Probability of buying again = 0.72, Target = Yes
Customer 15: Probability of buying again = 0.00, Target = No
Customer 16: Probability of

In [15]:
# Evaluate different thresholds to find the best one using the validation set
predictions = model.predict(validation_inputs)
conversion_probabilities = predictions[:, 1] # contains predicted probabilities for second class of each sample

best_threshold = 0.5
best_accuracy = 0

# Try different thresholds from 0.1 to 0.9
for threshold in np.arange(0.1, 1.0, 0.1):
    predicted_classes = (conversion_probabilities > threshold).astype(int)
    accuracy = np.mean(predicted_classes == validation_targets)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"Best threshold: {best_threshold}")
print(f"Validation accuracy with best threshold: {best_accuracy:.2f}")

# Use the best threshold to determine target customers in the training set
train_predictions = model.predict(train_inputs)
train_conversion_probabilities = train_predictions[:, 1]
train_target_customers = train_conversion_probabilities > best_threshold

# Print the results for the training set
for i, prob in enumerate(train_conversion_probabilities):
    print(f"Customer {i}: Probability of buying again = {prob:.2f}, Target = {'Yes' if prob > best_threshold else 'No'}")


Best threshold: 0.4
Validation accuracy with best threshold: 0.78
Customer 0: Probability of buying again = 0.67, Target = Yes
Customer 1: Probability of buying again = 0.32, Target = No
Customer 2: Probability of buying again = 0.24, Target = No
Customer 3: Probability of buying again = 0.00, Target = No
Customer 4: Probability of buying again = 0.14, Target = No
Customer 5: Probability of buying again = 0.52, Target = Yes
Customer 6: Probability of buying again = 0.47, Target = Yes
Customer 7: Probability of buying again = 0.88, Target = Yes
Customer 8: Probability of buying again = 0.00, Target = No
Customer 9: Probability of buying again = 0.99, Target = Yes
Customer 10: Probability of buying again = 0.00, Target = No
Customer 11: Probability of buying again = 1.00, Target = Yes
Customer 12: Probability of buying again = 0.72, Target = Yes
Customer 13: Probability of buying again = 0.00, Target = No
Customer 14: Probability of buying again = 0.72, Target = Yes
Customer 15: Probabil

In [16]:
#for i in range(len(train_conversion_probabilities)):
#    print(f"Customer {i}: Probability of buying again = {train_conversion_probabilities[i]:.2f}, Target = {'Yes' if train_conversion_probabilities[i] > best_threshold else 'No'}")

In [17]:
predicted_classes = (conversion_probabilities > threshold)
predicted_classes

array([False, False, False,  True,  True, False, False, False, False,
        True,  True, False, False, False, False, False,  True, False,
        True, False, False, False, False, False, False, False,  True,
       False, False, False,  True,  True, False, False, False,  True,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False,  True, False, False, False,  True, False, False,
       False,  True,  True,  True, False, False, False, False, False,
       False, False,  True,  True, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False, False,  True, False, False, False, False,
       False, False, False, False, False, False,  True, False,  True,
       False,  True,