In [2]:
import os

import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Locations: the source CSV and the folder that later receives the split arrays.
data_path = 'dataset/dataset.csv'
split_data_path = 'data/data_nn_seq/'

# Read the whole CSV into a DataFrame and display its (rows, columns) size.
df = pd.read_csv(data_path)
df.shape

(155734, 41)

In [6]:
# Split the dataset into train, validation, and test sets by port call, so
# that all rows sharing a 'ROW_ID' value land in the same set.
# unique() returns the identifiers in order of first appearance.
port_calls = df['ROW_ID'].unique()

# Cut positions inside port_calls for the 5 : 2.5 : 2.5 ratio.
indx_1 = round(0.5 * len(port_calls))
indx_2 = indx_1 + round(0.25 * len(port_calls))

# Slice the identifiers: the validation slice runs through position indx_2
# inclusive, and the test slice starts at the position right after it.
train_calls = port_calls[:indx_1]
valid_calls = port_calls[indx_1:indx_2 + 1]
test_calls = port_calls[indx_2 + 1:]

print('The number of examples/voyages in the train set is:', len(train_calls))
print('The number of examples/voyages in the validation set is:', len(valid_calls))
print('The number of examples/voyages in the test set is:', len(test_calls))

# Keep the rows belonging to each group of port calls, then discard the
# 'ROW_ID' identifier column from the resulting frame.
train = df[df['ROW_ID'].isin(train_calls)].drop(columns=['ROW_ID'])
valid = df[df['ROW_ID'].isin(valid_calls)].drop(columns=['ROW_ID'])
test = df[df['ROW_ID'].isin(test_calls)].drop(columns=['ROW_ID'])

print('')
print('The train set shape is:', train.shape)
print('The validation set shape is:', valid.shape)
print('The test set shape is:', test.shape)

The number of examples/voyages in the train set is: 1417
The number of examples/voyages in the validation set is: 709
The number of examples/voyages in the test set is: 708

The train set shape is: (77122, 40)
The validation set shape is: (38835, 40)
The test set shape is: (39777, 40)


In [7]:
# Standardize the columns listed in col_names; every other column is left as is.
col_names = [
    'Max_Draught', 'Latitude', 'Longitude', 'Speed_over_Ground',
    'GT', 'DWT', 'LOA', 'BEAM',
    'COG_cos', 'COG_sin', 'TH_cos', 'TH_sin',
    'current_uo', 'current_vo', 'wind_u10', 'wind_v10',
    'mwd', 'mwp', 'swh', 'sst',
    'Age', 'Origin_Lat', 'Origin_Lon',
    'acc_dist', 'acc_time_hours',
    'leg_distance', 'leg_speed', 'leg_elapsed_time_hours',
    'remaining_distance',
]

# Work on copies so the unscaled train / valid / test frames stay untouched.
scaled_features_train = train.copy()
scaled_features_valid = valid.copy()
scaled_features_test = test.copy()

# The scaler is fitted on the train columns only; the same fitted scaler is
# then applied to all three sets.
scaler = StandardScaler().fit(scaled_features_train[col_names].to_numpy())

features_train = scaler.transform(scaled_features_train[col_names].to_numpy())
features_valid = scaler.transform(scaled_features_valid[col_names].to_numpy())
features_test = scaler.transform(scaled_features_test[col_names].to_numpy())

# Write the scaled values back over the original columns of each copy.
scaled_features_train[col_names] = features_train
scaled_features_valid[col_names] = features_valid
scaled_features_test[col_names] = features_test


In [8]:
# Detach the 'Target' column from each scaled frame. pop() removes the column
# in place, so re-running this cell requires re-running the scaling cell first.
y_train = scaled_features_train.pop("Target")
y_valid = scaled_features_valid.pop("Target")
y_test = scaled_features_test.pop("Target")

# Targets become column vectors of shape (n_rows, 1).
y_train = np.array(y_train).reshape(-1, 1)
y_valid = np.array(y_valid).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# The remaining columns become 2-D arrays of shape (n_rows, n_columns);
# converting a DataFrame already yields that shape, so no reshape is needed.
x_train = np.array(scaled_features_train)
x_valid = np.array(scaled_features_valid)
x_test = np.array(scaled_features_test)

In [9]:
# Report the (rows, columns) size of each feature array.
print('x_train shape:', np.shape(x_train))
print('x_valid shape:', np.shape(x_valid))
print('x_test shape:', np.shape(x_test))

x_train shape: (77122, 39)
x_valid shape: (38835, 39)
x_test shape: (39777, 39)


In [None]:
# Persist the six arrays under split_data_path; np.save appends the '.npy'
# extension to each file name.
# np.save does not create missing folders, so make sure the target exists
# first (no error if it is already there).
os.makedirs(split_data_path, exist_ok=True)

# os.path.join gives the same paths whether or not split_data_path ends
# with a path separator.
np.save(os.path.join(split_data_path, 'x_train'), x_train)
np.save(os.path.join(split_data_path, 'x_test'), x_test)
np.save(os.path.join(split_data_path, 'x_valid'), x_valid)
np.save(os.path.join(split_data_path, 'y_train'), y_train)
np.save(os.path.join(split_data_path, 'y_test'), y_test)
np.save(os.path.join(split_data_path, 'y_valid'), y_valid)