# The Titanic example - Preprocessing

### Import the relevant libraries

In [26]:
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

### Extract the data from the csv

In [25]:
# The CSV is addressed relative to the parent of the current working directory.
path = os.path.join(os.pardir, 'data/titanic.csv')
titanic_dataset = pd.read_csv(path)

# Overwrite the 'Sex' column in place with the integer codes produced by LabelEncoder.
le = LabelEncoder()
titanic_dataset['Sex'] = le.fit_transform(titanic_dataset['Sex'])

# Feature columns, in the order they appear in the input matrix.
input_columns = [
    'Pclass',
    'Sex',
    'Age',
    'Siblings/Spouses',
    'Parents/Children',
    'Fare',
]

# Pull the features and the 'Survived' target out as NumPy arrays.
titanic_inputs = titanic_dataset[input_columns].values
titanic_targets = titanic_dataset['Survived'].values

### Balance the dataset 

In [43]:
# Row count and the sum of the targets (the number of 1-targets, assuming the
# targets are 0/1 labels).
num_samples = titanic_targets.shape[0]
num_one_targets = titanic_targets.sum()

# Walk the targets in their original order. Every 0-target is counted; once the
# count exceeds the number of 1-targets, the row index is marked for removal.
zero_targets_counter = 0
indices_to_remove = []
for i, target in enumerate(titanic_targets):
    if target != 0:
        continue
    zero_targets_counter += 1
    if zero_targets_counter > num_one_targets:
        indices_to_remove.append(i)

# Drop the marked rows from inputs and targets alike so they stay aligned.
inputs_equal_priors = np.delete(titanic_inputs, indices_to_remove, axis=0)
targets_equal_priors = np.delete(titanic_targets, indices_to_remove, axis=0)

# From here on num_samples refers to the size of the reduced dataset.
num_samples = len(targets_equal_priors)

### Standardize the inputs 

In [69]:
# Standardize the balanced inputs with sklearn's preprocessing.scale.
# NOTE(review): this is applied to every row of inputs_equal_priors, i.e. before
# the train/validation/test split, so the scaling statistics come from all rows
# rather than from the training rows only -- confirm this is intended.
scaled_inputs = preprocessing.scale(inputs_equal_priors)

### Shuffle the data

In [70]:
# Build the row indices 0..num_samples-1 and shuffle them in place.
# No seed is set here, so the resulting order differs from run to run.
shuffled_indices = np.arange(num_samples)
np.random.shuffle(shuffled_indices)

# Reorder inputs and targets with the same index array so that every input row
# keeps its own target.
shuffled_inputs, shuffled_targets = (
    scaled_inputs[shuffled_indices],
    targets_equal_priors[shuffled_indices],
)

### Split the dataset into train, validation, and test

In [64]:
# Split sizes: 75% for training, 15% for validation, and the remainder for test.
num_train_samples = int(0.75 * num_samples)
num_validation_samples = int(0.15 * num_samples)
num_test_samples = num_samples - (num_train_samples + num_validation_samples)

# Consecutive row ranges of the shuffled data, one per split.
validation_end = num_train_samples + num_validation_samples
train_rows = slice(None, num_train_samples)
validation_rows = slice(num_train_samples, validation_end)
test_rows = slice(validation_end, None)

train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

# For each split print: sum of targets, split size, and their ratio, to check
# that the splits remain roughly balanced.
for split_targets, split_size in (
    (train_targets, num_train_samples),
    (validation_targets, num_validation_samples),
    (test_targets, num_test_samples),
):
    target_sum = np.sum(split_targets)
    print(target_sum, split_size, target_sum / split_size)

259 513 0.5048732943469786
50 102 0.49019607843137253
33 69 0.4782608695652174


In [65]:
# Persist each split as its own archive with the arrays stored under the keys
# 'inputs' and 'targets'. np.savez appends the '.npz' extension to the file name.
for filename, split_inputs, split_targets in (
    ('Titanic_data_train', train_inputs, train_targets),
    ('Titanic_data_validation', validation_inputs, validation_targets),
    ('Titanic_data_test', test_inputs, test_targets),
):
    np.savez(filename, inputs=split_inputs, targets=split_targets)