In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
raw_data = np.loadtxt("FIFA19_preprocessed.csv", delimiter =",", skiprows=1)

unscaled_inputs = raw_data[:, 1:-1]
targets = raw_data[:, -1]

# Note: Since we are treating this as a regression problem, we will not be balancing the dataset in this case.

## Standardizing the inputs

In [3]:
scaler = preprocessing.StandardScaler()
scaled_inputs = scaler.fit_transform(unscaled_inputs)

## Shuffling the data

In [4]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_scaled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets[shuffled_indices]

In [5]:
print (shuffled_targets.shape)
print (shuffled_scaled_inputs.shape)

(18140,)
(18140, 328)


In [6]:
shuffled_scaled_inputs[5:7]

array([[-1.14812072e-01, -2.39773530e-01,  2.13796869e+00,
         3.25254293e+00,  2.28261479e+00,  1.23602642e+00,
         5.49325189e-01,  2.27651576e+00,  8.04040064e-02,
         8.45983433e-01,  6.68692119e-01, -1.07175379e-01,
         6.68692119e-01, -3.66783300e-01,  6.68692119e-01,
        -1.06383216e-01,  8.31650671e-01, -1.46275071e-01,
         7.55015137e-01, -2.87677981e-02,  7.55015137e-01,
        -6.40007196e-02,  7.55015137e-01, -2.87677981e-02,
         8.31650671e-01, -1.44296949e-01,  7.89318941e-01,
        -3.40441531e-02,  7.89318941e-01, -2.35996995e-01,
         7.89318941e-01, -3.40441531e-02,  9.45334292e-01,
        -2.53459490e-01,  9.41255972e-01, -1.49197156e-01,
         9.41255972e-01, -2.88519947e-01,  9.41255972e-01,
        -1.48423078e-01,  9.45334292e-01, -2.56890698e-01,
         1.36683372e+00, -6.57149804e-02,  1.28303470e+00,
        -1.16523365e-01,  1.28303470e+00, -2.34823181e-01,
         1.28303470e+00, -1.17732508e-01,  1.36683372e+0

In [7]:
shuffled_scaled_inputs[5:7, :]

array([[-1.14812072e-01, -2.39773530e-01,  2.13796869e+00,
         3.25254293e+00,  2.28261479e+00,  1.23602642e+00,
         5.49325189e-01,  2.27651576e+00,  8.04040064e-02,
         8.45983433e-01,  6.68692119e-01, -1.07175379e-01,
         6.68692119e-01, -3.66783300e-01,  6.68692119e-01,
        -1.06383216e-01,  8.31650671e-01, -1.46275071e-01,
         7.55015137e-01, -2.87677981e-02,  7.55015137e-01,
        -6.40007196e-02,  7.55015137e-01, -2.87677981e-02,
         8.31650671e-01, -1.44296949e-01,  7.89318941e-01,
        -3.40441531e-02,  7.89318941e-01, -2.35996995e-01,
         7.89318941e-01, -3.40441531e-02,  9.45334292e-01,
        -2.53459490e-01,  9.41255972e-01, -1.49197156e-01,
         9.41255972e-01, -2.88519947e-01,  9.41255972e-01,
        -1.48423078e-01,  9.45334292e-01, -2.56890698e-01,
         1.36683372e+00, -6.57149804e-02,  1.28303470e+00,
        -1.16523365e-01,  1.28303470e+00, -2.34823181e-01,
         1.28303470e+00, -1.17732508e-01,  1.36683372e+0

 #### For 2D arrays, _shuffled_scaled_inputs[5:7]_ and _shuffled_scaled_inputs[5:7, :]_ gives the same thing

In [8]:
shuffled_indices

array([11455, 11962, 14275, ...,  7371, 15774,  1605])

In [9]:
shuffled_scaled_inputs[0]

array([ 0.53155541,  1.04522086,  1.84814937,  1.3127719 ,  0.96048857,
        0.7887435 , -1.82041534, -0.28797507,  0.08040401, -0.47787263,
        0.1858185 , -0.10717538,  0.1858185 , -0.3667833 ,  0.1858185 ,
       -0.10638322,  0.08251489, -0.14627507,  0.04919472, -0.0287678 ,
        0.04919472, -0.06400072,  0.04919472, -0.0287678 ,  0.08251489,
       -0.14429695,  0.13198024, -0.03404415,  0.13198024, -0.235997  ,
        0.13198024, -0.03404415,  0.19442341, -0.25345949,  0.45822468,
       -0.14919716,  0.45822468, -0.28851995,  0.45822468, -0.14842308,
        0.19442341, -0.2568907 ,  0.88060013, -0.06571498,  1.09097481,
       -0.11652336,  1.09097481, -0.23482318,  1.09097481, -0.11773251,
        0.88060013, -0.06942005,  1.05581462, -0.28036807,  1.35534035,
       -0.19247209,  1.35534035,  3.03355809,  1.35534035, -0.19461805,
        1.05581462, -0.27680635,  0.28693389,  0.2798092 ,  1.36383145,
        0.63370884,  0.0620584 , -0.60136883,  0.37165024,  0.40

In [10]:
scaled_inputs[12706]

array([ 0.6493501 , -0.88227072,  1.12360107,  1.03566175,  0.34664425,
        1.04171499,  0.54932519, -0.28797507,  0.08040401,  0.84598343,
        0.66869212, -0.10717538,  0.66869212, -0.3667833 ,  0.66869212,
       -0.10638322,  0.69118771, -0.14627507,  0.75501514, -0.0287678 ,
        0.75501514, -0.06400072,  0.75501514, -0.0287678 ,  0.69118771,
       -0.14429695,  0.83627171, -0.03404415,  0.83627171, -0.235997  ,
        0.83627171, -0.03404415,  0.71067464, -0.25345949,  0.9895591 ,
       -0.14919716,  0.9895591 ,  3.46596487,  0.9895591 , -0.14842308,
        0.71067464, -0.2568907 ,  0.78335341, -0.06571498,  1.04295984,
       -0.11652336,  1.04295984, -0.23482318,  1.04295984, -0.11773251,
        0.78335341, -0.06942005,  0.81295642, -0.28036807,  0.88464236,
       -0.19247209,  0.88464236, -0.3296459 ,  0.88464236, -0.19461805,
        0.81295642, -0.27680635,  0.50482263,  0.94593682,  0.27053821,
        1.24631456,  0.23169114,  0.56245419,  0.96992934,  1.26

## Splitting dataset into Train, Validation & Test

Shall be splitting dataset into 80% Train, 10% Validation & 10% Test

In [11]:
shuffled_targets.shape

(18140,)

In [12]:
count = shuffled_scaled_inputs.shape[0]

train_sample_count = int(0.8 * count)
validation_sample_count = int(0.1 * count)
test_sample_count = count - train_sample_count - validation_sample_count

train_inputs = shuffled_scaled_inputs[:train_sample_count, :]
train_targets = shuffled_targets[:train_sample_count]

validation_inputs = shuffled_scaled_inputs[train_sample_count:validation_sample_count + train_sample_count, :]
validation_targets = shuffled_targets[train_sample_count:validation_sample_count + train_sample_count]

test_inputs = shuffled_scaled_inputs[validation_sample_count + train_sample_count:test_sample_count + validation_sample_count + train_sample_count, :]
test_targets = shuffled_targets[validation_sample_count + train_sample_count:test_sample_count + validation_sample_count + train_sample_count]

print (train_targets.shape[0])
print (validation_targets.shape[0])
print (test_targets.shape[0])

print ((test_targets.shape[0] + validation_targets.shape[0] + train_targets.shape[0])/count)

14512
1814
1814
1.0


##### So we have:

1. Training set: 14512 input/targets

2. Validation set: 1814 input/targets

3. Testing set: 1814 input/targets

## Saving the 3 datasets in *.npz

In [13]:
np.savez("FIFA19_data_train", inputs=train_inputs, targets=train_targets)
np.savez("FIFA19_data_validation", inputs=validation_inputs, targets=validation_targets)
np.savez("FIFA19_data_test", inputs=test_inputs, targets=test_targets)