In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# Parse the preprocessed CSV into a 2-D float array.
# skiprows=1 skips the first line of the file (presumably the header row).
raw_data = np.loadtxt(
    "FIFA19_preprocessed.csv",
    delimiter=",",
    skiprows=1,
)

# The last column is used as the regression target. Every column between the
# first and the last is a feature; column 0 is dropped
# (NOTE(review): presumably a row index / ID column - confirm).
targets = raw_data[:, -1]
unscaled_inputs = raw_data[:, 1:-1]

# Note: Since we are treating this as a regression problem, we will not be balancing the dataset in this case.

## Standardizing the inputs

In [3]:
# Standardize each feature column (StandardScaler: subtract the column mean,
# divide by the column standard deviation).
# NOTE(review): the statistics are computed over every row of `unscaled_inputs`,
# i.e. before any train/validation/test partitioning takes place.
scaler = preprocessing.StandardScaler()
scaler.fit(unscaled_inputs)
scaled_inputs = scaler.transform(unscaled_inputs)

## Shuffling the data

In [4]:
# Fixed seed so that "Restart & Run All" reproduces the same row order, and
# therefore the same train/validation/test membership, on every run.
# A private RandomState is used so NumPy's global random state is not modified.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

# Random permutation of the row indices 0 .. n_rows-1.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Index inputs and targets with the same permutation so that every input row
# stays aligned with its target.
shuffled_scaled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets[shuffled_indices]

In [5]:
# Sanity check after shuffling: both arrays must report the same number of rows.
print(shuffled_targets.shape)
print(shuffled_scaled_inputs.shape)

(18140,)
(18140, 328)


## Splitting dataset into Train, Validation & Test

The dataset will be split into 80% train, 10% validation and 10% test.

In [6]:
# Display the number of rows that the split below will partition.
shuffled_targets.shape

(18140,)

In [7]:
# Split sizes: 80% train, 10% validation, and whatever is left (~10%) for test,
# so the three parts always account for every row.
count = shuffled_scaled_inputs.shape[0]

train_sample_count = int(0.8 * count)
validation_sample_count = int(0.1 * count)
test_sample_count = count - train_sample_count - validation_sample_count

# Row offsets at which the training and validation slices end.
train_end = train_sample_count
validation_end = train_end + validation_sample_count

# Take the rows from the *unscaled* features, in the same shuffled order as the
# targets, so that the standardization statistics can be learned from the
# training rows only. Scaling with statistics computed over all rows would let
# the validation/test rows influence the training data (data leakage) and make
# the held-out scores optimistic.
shuffled_unscaled_inputs = unscaled_inputs[shuffled_indices]
assert shuffled_unscaled_inputs.shape[0] == count == shuffled_targets.shape[0]

# Fit on the training rows only, then apply the same transform to every split.
train_scaler = preprocessing.StandardScaler().fit(shuffled_unscaled_inputs[:train_end])

train_inputs = train_scaler.transform(shuffled_unscaled_inputs[:train_end])
train_targets = shuffled_targets[:train_end]

validation_inputs = train_scaler.transform(shuffled_unscaled_inputs[train_end:validation_end])
validation_targets = shuffled_targets[train_end:validation_end]

test_inputs = train_scaler.transform(shuffled_unscaled_inputs[validation_end:])
test_targets = shuffled_targets[validation_end:]

# Every split must have matching inputs/targets, and together they must cover
# the whole dataset exactly once.
assert train_inputs.shape[0] == train_targets.shape[0] == train_sample_count
assert validation_inputs.shape[0] == validation_targets.shape[0] == validation_sample_count
assert test_inputs.shape[0] == test_targets.shape[0] == test_sample_count

print(train_targets.shape[0])
print(validation_targets.shape[0])
print(test_targets.shape[0])

# Fraction of the dataset covered by the three splits (expected: 1.0).
print((test_targets.shape[0] + validation_targets.shape[0] + train_targets.shape[0]) / count)

14512
1814
1814
1.0


##### So we have:

1. Training set: 14512 input/target pairs

2. Validation set: 1814 input/target pairs

3. Testing set: 1814 input/target pairs

## Saving the 3 datasets as `.npz` files

In [8]:
# Write each split to its own archive; np.savez appends the ".npz" extension
# to the given name. Inside every archive the arrays are stored under the keys
# "inputs" and "targets".
for archive_name, split_inputs, split_targets in (
    ("FIFA19_data_train", train_inputs, train_targets),
    ("FIFA19_data_validation", validation_inputs, validation_targets),
    ("FIFA19_data_test", test_inputs, test_targets),
):
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)