In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# Read the comma-separated dataset into a single NumPy array and show it.
raw_csv_data = np.loadtxt(fname='Business_case_dataset.csv', delimiter=',')
print(raw_csv_data)


[[8.7300e+02 2.1600e+03 2.1600e+03 ... 0.0000e+00 0.0000e+00 1.0000e+00]
 [6.1100e+02 1.4040e+03 2.8080e+03 ... 0.0000e+00 1.8200e+02 1.0000e+00]
 [7.0500e+02 3.2400e+02 3.2400e+02 ... 1.0000e+00 3.3400e+02 1.0000e+00]
 ...
 [2.8671e+04 1.0800e+03 1.0800e+03 ... 0.0000e+00 2.9000e+01 0.0000e+00]
 [3.1134e+04 2.1600e+03 2.1600e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [3.2832e+04 1.6200e+03 1.6200e+03 ... 0.0000e+00 9.0000e+01 0.0000e+00]]


In [3]:
# Inputs: every column of the raw data except the last one, which holds the target.
unscaled_inputs_all = raw_csv_data[:, :-1]
print(unscaled_inputs_all)

[[8.7300e+02 2.1600e+03 2.1600e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [6.1100e+02 1.4040e+03 2.8080e+03 ... 0.0000e+00 0.0000e+00 1.8200e+02]
 [7.0500e+02 3.2400e+02 3.2400e+02 ... 0.0000e+00 1.0000e+00 3.3400e+02]
 ...
 [2.8671e+04 1.0800e+03 1.0800e+03 ... 0.0000e+00 0.0000e+00 2.9000e+01]
 [3.1134e+04 2.1600e+03 2.1600e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [3.2832e+04 1.6200e+03 1.6200e+03 ... 0.0000e+00 0.0000e+00 9.0000e+01]]


In [4]:
# Targets: only the last column of the raw data.
targets_all = raw_csv_data[:, -1]
print(targets_all)

[1. 1. 1. ... 0. 0. 0.]


In [5]:
# Balance the dataset so that it contains as many 0 targets as 1 targets.
# A target of 1 means the customer bought again and 0 means they did not,
# so the sum of the target column is the number of 1 targets.
num_one_targets = int(np.sum(targets_all))
print(num_one_targets)

# Row positions of every sample whose target is 0, in their original order.
zero_target_positions = np.flatnonzero(targets_all == 0)
zero_targets_counter = int(zero_target_positions.size)  # total number of 0 targets

# Keep the first `num_one_targets` zero-target rows and mark every later one
# for removal. When there are not more 0s than 1s this is empty and no row
# is dropped.
indices_to_remove = zero_target_positions[num_one_targets:]

# Build the balanced inputs and targets by deleting the marked rows from both,
# which keeps every input row aligned with its target.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

2237


In [6]:
# Standardize the balanced inputs (the algorithm can also be tried without this step).
# NOTE(review): scaling is applied to the whole balanced set, before the
# train/validation/test split is made - confirm that this is intended.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)
print(scaled_inputs)

[[-1.45560485  1.18956512  0.36398846 ... -0.8635056  -0.20536617
  -0.77240946]
 [-1.48242424 -0.33022754  1.10843845 ... -0.8635056  -0.20536617
   1.16499791]
 [-1.47280202 -2.50135991 -1.74528653 ... -0.8635056   2.23179102
   2.78305242]
 ...
 [ 1.73938284  1.18956512  0.36398846 ... -0.20129479 -0.20536617
  -0.62337812]
 [ 1.7477767   1.18956512  0.36398846 ... -0.20129479 -0.20536617
   0.21758442]
 [ 1.73559537  1.18956512  0.36398846 ... -0.20129479 -0.20536617
  -0.51692717]]


In [7]:
# Shuffle the samples so that the contiguous slices taken later are random subsets.
# NOTE(review): no random seed is set in the visible cells, so the resulting
# order (and therefore the split) differs from run to run.
shuffled_indices = np.arange(len(scaled_inputs))  # 0, 1, ..., number of rows - 1
print(shuffled_indices)
np.random.shuffle(shuffled_indices)  # permutes the index array in place

# Reorder inputs and targets with the same permutation so each pair stays aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]
print(shuffled_indices)
print(shuffled_inputs)

[   0    1    2 ... 4471 4472 4473]
[4280 2156 3735 ...   28 3776 1351]
[[ 0.24292234 -0.76445401 -0.75268653 ...  0.24017908 -0.20536617
   2.42111918]
 [-1.40954102 -0.76445401 -0.75268653 ... -0.8635056  -0.20536617
  -0.36789583]
 [-0.74939506  1.18956512  0.36398846 ...  0.49402656 -0.20536617
  -0.72982908]
 ...
 [-1.50750344 -1.8500202  -1.37306153 ... -0.8635056  -0.20536617
  -0.77240946]
 [-0.75277307  0.10399894 -0.25638654 ...  0.49402656 -0.20536617
  -0.77240946]
 [ 0.52954177  1.18956512  0.36398846 ... -0.8635056  -0.20536617
  -0.77240946]]


In [8]:
# Split the shuffled data 80% / 10% / 10% into training, validation and test sets.
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
# The test set takes every remaining sample, so truncation above never drops a row.
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row positions at which one subset ends and the next begins.
train_end = train_samples_count
validation_end = train_end + validation_samples_count

# Training subset: the first 80% of the shuffled rows.
train_inputs = shuffled_inputs[:train_end]
train_targets = shuffled_targets[:train_end]

# Validation subset: the next 10% of the shuffled rows.
validation_inputs = shuffled_inputs[train_end:validation_end]
validation_targets = shuffled_targets[train_end:validation_end]

# Test subset: everything after the validation rows.
test_inputs = shuffled_inputs[validation_end:]
test_targets = shuffled_targets[validation_end:]

# For each subset print: number of 1 targets, subset size, and their ratio,
# to check that the classes stay roughly balanced after the split.
for subset_targets, subset_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    print(np.sum(subset_targets), subset_size, np.sum(subset_targets) / subset_size)

1807.0 3579 0.504889633975971
217.0 447 0.4854586129753915
213.0 448 0.47544642857142855


In [9]:
# Save each subset in .npz format, one archive per subset, with the arrays
# stored under the keys 'inputs' and 'targets'.
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
# The test subset is written to its own archive so it is available for final evaluation.
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

