# Importing libraries

In [1]:
import numpy as np
from sklearn import preprocessing

# Loading data

In [2]:
# Read the whole comma-separated file into a single 2-D numeric array.
raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Inputs: every column except the first and the last.
# NOTE(review): the first column is presumably a customer/row identifier with
# no predictive value — confirm against the data dictionary.
unscaled_inputs_all = raw_csv_data[:, 1:-1]

# Targets: the last column only.
targets_all = raw_csv_data[:, -1]

# Objective

We want to understand which customers may return to our audiobook service based on their application usage pattern.

# Balancing data

We need to balance the data because the two target classes are very unequal in size, and this imbalance would heavily affect our model's performance. To balance the data, we will remove some of the records of users who did not convert, so that both classes have the same number of records.

In [3]:
# Balance the classes by undersampling the zero-target rows.
# NOTE(review): assumes the targets are strictly 0/1, so their sum equals the
# number of one-target rows — confirm against the data.
num_one_targets = int(np.sum(targets_all))

# Row indices of every zero-target record, in file order. The first
# `num_one_targets` of them are kept and every later one is marked for removal,
# so the zero class ends up with at most as many rows as the one class.
zero_target_indices = np.flatnonzero(targets_all == 0)
indices_to_remove = zero_target_indices[num_one_targets:]

# Delete the surplus rows from inputs and targets using the same indices so the
# two arrays stay aligned row-for-row.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

# Standardize inputs

We standardize the inputs (zero mean and unit variance for each feature), as this generally helps to improve model performance.

In [4]:
# Standardize each feature column (axis=0) of the balanced inputs.
# NOTE(review): the scaling statistics are computed over every balanced row,
# i.e. before the train/validation/test split made later in this notebook, so
# held-out rows influence the scaling. Consider fitting the scaler on the
# training rows only.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors, axis=0)

# Shuffle the input

We shuffle the inputs so that the data does not contain long runs of conversions or non-conversions. This is important because, while training the model, we will batch the inputs, and each batch should contain roughly equal proportions of both targets.

In [5]:
# Fixed seed so that the shuffle — and therefore the train/validation/test
# membership derived from it — is the same on every Restart & Run All.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# One random permutation of the row indices, applied to both arrays so that
# every input row stays paired with its own target.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

# Split train, test and validation set

In [6]:
samples_count = shuffled_inputs.shape[0]

# 80% / 10% split sizes (truncated to whole rows); the test set takes whatever
# is left so that the three sizes always add up to samples_count.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row positions at which the shuffled arrays are cut:
# [0, train_end) -> train, [train_end, validation_end) -> validation, rest -> test.
train_end = train_samples_count
validation_end = train_end + validation_samples_count
split_points = [train_end, validation_end]

# Cut inputs and targets at the same positions so the rows stay aligned.
train_inputs, validation_inputs, test_inputs = np.split(shuffled_inputs, split_points)
train_targets, validation_targets, test_targets = np.split(shuffled_targets, split_points)

# Check if data splits are balanced

In [7]:
# For each split print: sum of targets, number of rows, and their ratio
# (the share of one-targets, given 0/1 labels). Order: train, validation, test.
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    positives = np.sum(split_targets)
    print(positives, split_size, positives / split_size)

1772.0 3579 0.4951103660240291
232.0 447 0.5190156599552572
233.0 448 0.5200892857142857


The output above shows that the train, validation and test sets are all roughly balanced: about half of the targets in each split are 1.

# Saving dataset in npz files

We will save the preprocessed datasets to be used later during model training.

In [8]:
# Write one .npz archive per split; each archive stores the arrays under the
# keys 'inputs' and 'targets'. np.savez appends the '.npz' extension itself.
for file_stem, (split_inputs, split_targets) in (
    ('AB_train', (train_inputs, train_targets)),
    ('AB_validation', (validation_inputs, validation_targets)),
    ('AB_test', (test_inputs, test_targets)),
):
    np.savez(file_stem, inputs=split_inputs, targets=split_targets)