In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import torch

## Loading the Data

In [2]:
# Read the comma-separated file into a NumPy array in one call.
# The path is relative, so the CSV is looked up in the notebook's working directory.
csv_path = 'Audiobooks_data.csv'
raw_data = np.loadtxt(csv_path, delimiter=",")

In [3]:
# Column layout used below: the first column is skipped, the last column holds
# the targets, and every column in between is kept as a model input.
# NOTE(review): the skipped first column is presumably a row identifier — confirm.
feature_columns = slice(1, -1)
target_column = -1

unscaled_inputs_all = raw_data[:, feature_columns]
targets_all = raw_data[:, target_column]

## Balancing the Dataset

In [4]:
# Balance the classes by undersampling the zero-target rows: keep the first
# `num_one_targets` rows whose target is 0 (in original order) and drop every
# later zero-target row, so zeros never outnumber ones.
# Summing the targets counts the ones only if the labels are strictly 0/1
# — assumes binary labels; TODO confirm against the data source.
num_one_targets = int(np.sum(targets_all))

# Positions of all zero-target rows, in ascending order.
zero_positions = np.flatnonzero(targets_all == 0)
zero_targets = int(zero_positions.size)

# Everything past the first `num_one_targets` zeros is surplus. The slice is
# empty when there are no more zeros than ones, in which case nothing is removed.
indices_to_remove = zero_positions[num_one_targets:].tolist()

# Remove the same rows from inputs and targets so they stay aligned.
unscaled_inputs_equal = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal = np.delete(targets_all, indices_to_remove, axis=0)

Shape of the inputs and targets:

In [5]:
# Sanity check: the same row indices were deleted from both arrays, so the
# leading dimension of the inputs must equal the length of the targets.
unscaled_inputs_equal.shape, targets_equal.shape

((4474, 10), (4474,))

## Standardizing the Inputs

In [6]:
# Standardize the balanced inputs with scikit-learn's `preprocessing.scale`.
# NOTE(review): this runs on the full balanced dataset, before the
# train/validation/test split performed later in the notebook, so rows that end
# up in the validation and test sets contribute to the scaling statistics.
# Consider fitting the scaler on the training rows only to avoid leakage.
scaled_inputs_equal = preprocessing.scale(unscaled_inputs_equal)

## Shuffling the Data

In [7]:
# Shuffle rows with a locally seeded generator so that Restart & Run All
# reproduces the same ordering (and therefore the same train/validation/test
# split) on every run, without touching NumPy's global random state.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# A random permutation of the row indices 0 .. n_rows-1.
shuffled_indices = shuffle_rng.permutation(scaled_inputs_equal.shape[0])

# Apply the same permutation to inputs and targets so they stay aligned.
shuffled_inputs = scaled_inputs_equal[shuffled_indices]
shuffled_targets = targets_equal[shuffled_indices]

## Splitting the Data into Train, Validation, and Test Sets

In [8]:
# Share of rows assigned to the training and validation sets; the test set
# receives whatever is left over.
TRAIN_FRACTION = 0.8
VALID_FRACTION = 0.1

samples_count = len(shuffled_inputs)

# int() truncates, so the two sizes below are rounded down.
train_samples = int(TRAIN_FRACTION * samples_count)
valid_samples = int(VALID_FRACTION * samples_count)

# The remainder goes to the test set, which guarantees the three sizes add up
# to samples_count exactly.
test_samples = samples_count - train_samples - valid_samples

In [9]:
# Display the total row count followed by the train, validation and test sizes;
# the three split sizes sum to samples_count by construction.
samples_count, train_samples, valid_samples, test_samples

(4474, 3579, 447, 448)

In [10]:
# Contiguous row ranges for the three splits, taken in order from the shuffled
# data: training rows first, then validation, then everything left for testing.
train_rows = slice(None, train_samples)
valid_rows = slice(train_samples, train_samples + valid_samples)
test_rows = slice(train_samples + valid_samples, None)

# The same row range is applied to inputs and targets so each pair stays aligned.
train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
valid_inputs, valid_targets = shuffled_inputs[valid_rows], shuffled_targets[valid_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

In [15]:
# Peek at the training labels; NumPy abbreviates the repr of a long array, so
# only the first and last few values are shown.
train_targets

array([1., 1., 1., ..., 0., 0., 1.], shape=(3579,))