# Build the datasets to use

This notebook synchronizes vessel response data and weather data into a single dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import time

In [2]:
def read_files_into_dataframes(path, filename, date_column_name):
    """
    Read a csv file into a DataFrame indexed by its date column.
    
    args:
    path (str) -> Base directory. It is concatenated with filename as-is, so it
                  must already end with a path separator (or be empty).
    filename (str) -> .csv file located into path (extension check is case-insensitive).
    date_column_name (str) -> column name with time information.
    
    return:
    DataFrame -> dataframe with date_column_name parsed as dates and set as index;
                 dtype 'float32' is requested from pandas for the data columns.
    
    raises:
    ValueError -> if filename does not end with the '.csv' extension.
    
    """
    
    # Test the real suffix: comparing only the last '.'-separated piece accepts
    # an extension-less file literally named 'csv' and rejects 'DATA.CSV'.
    if not filename.lower().endswith('.csv'):
        raise ValueError('Enter a valid csv filename')
    
    df = pd.read_csv(''.join([path, filename]), parse_dates=[date_column_name], dtype='float32')
    df.set_index(date_column_name, inplace=True)
    return df

In [3]:
def filter_validation_time(df, is_train=True, min_date='', max_date='', date_column_name='Date'):
    """
    Split df by a time window defined on its index.
    
    args:
    
    df (pd.DataFrame) -> data frame whose index holds the timestamps to filter on.
    is_train (bool) -> if True, return the rows OUTSIDE [min_date, max_date];
                       if False, return the rows INSIDE [min_date, max_date].
    min_date -> smallest date in validation period (inclusive). '' or None means
                "use the smallest index value".
    max_date -> highest date in validation period (inclusive). '' or None means
                "use the highest index value".
    date_column_name (str) -> currently unused; the filter always works on df.index.
                              Kept so existing callers keep working.
    
    returns:
    
    pd.DataFrame -> new data frame filtered by date; df itself is not modified.
    
    """
    
    # Treat None like '': comparing the index against None gives an all-False
    # mask, which would silently return an empty (or complete) frame.
    if min_date is None or min_date == '':
        min_date = df.index.min()
        
    if max_date is None or max_date == '':
        max_date = df.index.max()
        
    # Both bounds are inclusive.
    inside_period = (df.index >= min_date) & (df.index <= max_date)
    
    # Boolean-mask indexing already returns a new object, so no explicit copy
    # of the whole input frame is needed beforehand.
    if is_train:
        return df[~inside_period]
    
    return df[inside_period]

In [4]:
# Base directory of the csv files. File paths are built by plain string
# concatenation, so the trailing separator is required. A forward slash works
# on Windows, Linux and macOS, whereas a backslash is a separator only on Windows.
base_path = 'data/'

# Reading interpolated weather data.

In [5]:
# Load the weather csv from base_path, using its 'Date' column as index.
weather_filename = 'weather_interpolated.csv'
weather_data = read_files_into_dataframes(
    path=base_path,
    filename=weather_filename,
    date_column_name='Date',
)

# Reading resampled vessel response data.

In [6]:
# Load the vessel response csv from base_path, using its 'Date' column as index.
vessel_response_filename = 'vessel_response_agg_by_second.csv'
vessel_response_data = read_files_into_dataframes(
    path=base_path,
    filename=vessel_response_filename,
    date_column_name='Date',
)

# Build a dataset joining weather and vessel response data.

In [7]:
# Left join on 'Date': every vessel response row is kept and the weather
# columns are attached where a matching 'Date' exists.
complete_data = vessel_response_data.merge(weather_data, how='left', on='Date')

In [8]:
# Report the time span covered by the merged dataset.
first_timestamp = complete_data.index.min()
last_timestamp = complete_data.index.max()
data_time_interval = last_timestamp - first_timestamp

print(f'Total time interval :>> {data_time_interval}')
print(f'From: {first_timestamp}')
print(f'To: {last_timestamp}')

Total time interval :>> 19 days 23:59:59
From: 2020-08-15 00:00:00+00:00
To: 2020-09-03 23:59:59+00:00


# Dividing dataset into `train`, `test` and `validation`.

The dataset will be divided into 80% for `training`, 10% for `test` and 10% for `validation`.

- Training (16 days) - `15/08/2020 00:00:00` - `30/08/2020 23:59:59`
- Test (2 days) - `31/08/2020 00:00:00` - `01/09/2020 23:59:59`
- Validation (2 days) - `02/09/2020 00:00:00` - `03/09/2020 23:59:59`

In [9]:
# Training period (16 days). `is_train=False` is intentional here: it makes
# filter_validation_time return the rows INSIDE [min_date, max_date].
# Dates are written as ISO 8601 (YYYY-MM-DD) so that day and month cannot be
# swapped when the strings are parsed.
complete_data_train = filter_validation_time(complete_data,
                                             is_train=False,
                                             min_date='2020-08-15 00:00:00',
                                             max_date='2020-08-30 23:59:59')

In [10]:
# Test period (2 days). `is_train=False` makes filter_validation_time return
# the rows INSIDE [min_date, max_date].
# Dates are written as ISO 8601 (YYYY-MM-DD): a string such as '09/01/2020'
# can be read as either 1 September or 9 January depending on the parser.
complete_data_test = filter_validation_time(complete_data,
                                            is_train=False,
                                            min_date='2020-08-31 00:00:00',
                                            max_date='2020-09-01 23:59:59')

In [11]:
# Validation period (2 days). `is_train=False` makes filter_validation_time
# return the rows INSIDE [min_date, max_date].
# Dates are written as ISO 8601 (YYYY-MM-DD): strings such as '09/02/2020'
# can be read as either 2 September or 9 February depending on the parser.
complete_data_validation = filter_validation_time(complete_data,
                                                  is_train=False,
                                                  min_date='2020-09-02 00:00:00',
                                                  max_date='2020-09-03 23:59:59')

# Saving train data.

In [12]:
# Summarise the train split (size, time span, share of all rows), then write it to csv.
train_filename = 'complete_data_train.csv'
n_train_points = len(complete_data_train)
train_share = round((n_train_points / len(complete_data)) * 100, 2)

print(f'Number of points train: {n_train_points}')
print(f'Start date: {complete_data_train.index.min()}')
print(f'End date: {complete_data_train.index.max()}')
print(f'Percentual of data: {train_share}')

print('Saving file...')
complete_data_train.to_csv(base_path + train_filename)
print('File Saved')

Number of points train: 1382400
Start date: 2020-08-15 00:00:00+00:00
End date: 2020-08-30 23:59:59+00:00
Percentual of data: 80.0
Saving file...
File Saved


# Saving test data.

In [13]:
# Summarise the test split (size, time span, share of all rows), then write it to csv.
# The first label names the test split (it was copy-pasted as "train").
print(f'Number of points test: {len(complete_data_test)}')
print(f'Start date: {complete_data_test.index.min()}')
print(f'End date: {complete_data_test.index.max()}')
print(f'Percentual of data: {round((len(complete_data_test) / len(complete_data)) * 100, 2)}')
print('Saving file...')
test_filename = 'complete_data_test.csv'
complete_data_test.to_csv(''.join([base_path, test_filename]))
print('File Saved')

Number of points train: 172800
Start date: 2020-08-31 00:00:00+00:00
End date: 2020-09-01 23:59:59+00:00
Percentual of data: 10.0
Saving file...
File Saved


# Saving validation data.

In [14]:
# Summarise the validation split (size, time span, share of all rows), then write it to csv.
# The first label names the validation split (it was copy-pasted as "train").
print(f'Number of points validation: {len(complete_data_validation)}')
print(f'Start date: {complete_data_validation.index.min()}')
print(f'End date: {complete_data_validation.index.max()}')
print(f'Percentual of data: {round((len(complete_data_validation) / len(complete_data)) * 100, 2)}')
print('Saving file...')
valid_filename = 'complete_data_validation.csv'
complete_data_validation.to_csv(''.join([base_path, valid_filename]))
print('File Saved')

Number of points train: 172800
Start date: 2020-09-02 00:00:00+00:00
End date: 2020-09-03 23:59:59+00:00
Percentual of data: 10.0
Saving file...
File Saved
