# Processing Buoy 46214

Here I will process buoy 46214 using data utilities

In [None]:
buoy_num = 46214
buoy_str = '46214'

In [None]:
from data_utils.data_processing import *

obs_file_46214 = '/home/hutch_research/data/waves/buoys_20190315/CDIPObserved_46214_hourly.mat'
ww3_file_46214 = '/home/hutch_research/data/waves/buoys_20190315/WW3CFSRphase2_46214_rebanded.mat'

# Generate these for access to timegaps
buoy_object = ObservedDatasetFromFile(obs_file_46214)
glob_object = WW3DatasetFromFile(ww3_file_46214)

# Generate the shared 
obs_times, ww3_times, buoy_tensor, global_tensor = generate_combined_datasets(obs_file_46214, ww3_file_46214)

# just extract the first result
obs_times = obs_times[0]
ww3_times = ww3_times[0]
buoy_tensor = buoy_tensor[0]
global_tensor = global_tensor[0]

## Analyzing times

**Let's look at the time ranges we're working with**

In [3]:
from data_utils.matlab_datenums import matlab_datenum_to_py_date as mdtm

print("obs start:", mdtm(obs_times[0]), "end:", mdtm(obs_times[-1]), sep='\t')
print("ww3 start:", mdtm(ww3_times[0]), "end:", mdtm(ww3_times[-1]), sep='\t')

obs start:	1996-12-06 02:00:00.000003	end:	2009-04-19 02:00:00.000003
ww3 start:	1996-12-06 01:59:59.999993	end:	2009-04-19 01:59:59.999993


**Now we need to analyze the time gaps**
A gap is stored as an array of arrays:

`[ [[start date, end data], [start index, end index]], 
   ...
   [[start date, end data], [start index, end index]], 
 ]`

In [4]:
# check the buoy time gaps first
gaps = buoy_object.time_gaps
num_gaps = len(gaps)
print("Number of gaps:", num_gaps)

for i in range(num_gaps):
    cur_gap = gaps[i]
    cur_gap_dates = cur_gap[0]
    start = mdtm(cur_gap_dates[0])
    end = mdtm(cur_gap_dates[1])

    start_dtm = str(start.year) + "-" + str(start.month).zfill(2) + "-" + str(start.day).zfill(2) + " " + str(start.hour).zfill(2) + ":"  + str(start.minute).zfill(2)
    end_dtm = str(end.year) + "-" + str(end.month).zfill(2) + "-" + str(end.day).zfill(2) + " " + str(end.hour).zfill(2) + ":"  + str(end.minute).zfill(2)
    
    print("gap:", str(i).zfill(2), "  start:", start, "[%d]" % cur_gap[1][0], "  end:", end, "[%d]" % cur_gap[1][1])

Number of gaps: 11
gap: 00   start: 1993-09-01 00:00:00 [0]   end: 1996-12-06 02:00:00.000003 [28610]
gap: 01   start: 1997-09-20 12:59:59.999997 [35533]   end: 1997-10-13 21:00:00 [36093]
gap: 02   start: 1998-10-21 03:00:00 [45027]   end: 1998-11-04 08:00:00.000003 [45368]
gap: 03   start: 1999-09-12 08:00:00.000003 [52856]   end: 1999-10-18 18:59:59.999997 [53731]
gap: 04   start: 2002-12-14 18:59:59.999997 [81403]   end: 2003-01-10 18:00:00 [82050]
gap: 05   start: 2004-02-12 03:59:59.999997 [91588]   end: 2004-04-11 15:59:59.999997 [93016]
gap: 06   start: 2005-10-30 11:00:00.000003 [106619]   end: 2005-11-17 20:00:00.000003 [107060]
gap: 07   start: 2007-05-28 04:00:00.000007 [120412]   end: 2007-06-29 20:00:00.000003 [121196]
gap: 08   start: 2009-04-19 03:00:00 [137019]   end: 2009-05-21 22:00:00.000007 [137806]
gap: 09   start: 2014-09-05 11:00:00.000003 [184187]   end: 2014-10-08 18:00:00 [184986]
gap: 10   start: 2015-05-29 05:00:00.000003 [190565]   end: 2015-12-02 18:00:00

In [5]:
# Check the ww3 for time gaps -- it is very unlikely it has them

gaps = glob_object.time_gaps
num_gaps = len(gaps)
print("Number of gaps:", num_gaps)

if num_gaps > 0:
    for i in range(num_gaps):
        cur_gap = gaps[i]
        cur_gap_dates = cur_gap[0]
        start = mdtm(cur_gap_dates[0])
        end = mdtm(cur_gap_dates[1])

        start = str(start.year) + "-" + str(start.month).zfill(2) + "-" + str(start.day).zfill(2) + " " + str(start.hour).zfill(2) + ":"  + str(start.minute).zfill(2)
        end = str(end.year) + "-" + str(end.month).zfill(2) + "-" + str(end.day).zfill(2) + " " + str(end.hour).zfill(2) + ":"  + str(end.minute).zfill(2)

        print("gap:", str(i).zfill(2), "  start:", start, "[%d]" % cur_gap[1][0], "  end:", end, "[%d]" % cur_gap[1][1])

Number of gaps: 0


But the shared date range in our dataset is only:

`1996-12-06 02:00:00`	to	`2009-04-19 02:00:00`

*Note that in the data preprocessing `generate_combined_datasets` actually took this into account*

## Saving the dataset to a file

I created a method called `save_single_set_to_npz(filename, time_vector, obs_tensor, ww3_tensor, compressed=False)`

Typically I would save this with `obs_times` followed by the tensors according to their values.  The problem that I didn't solve early was managing a way to organize these into train, dev, and test.  I'll do a simple save here, and then walk through dividing up the data into train, dev, test.

Special note on this --> I also had to create a method later con called `convert_to_third_order_tensor(some_data)`, we'll need this :-\
   

In [6]:
outdir = 'data/'
suffix = str(datetime.date.today()) + '.npz'
filename = outdir + buoy_str + '_combined_data_' + suffix

save_single_set_to_npz(filename, obs_times, convert_to_third_order_tensor(buoy_tensor), convert_to_third_order_tensor(global_tensor))

print("done... saved to " + filename)

done... saved to data/46214_combined_data_2019-03-29.npz


## Setting up Train, Dev, and Test

Now I need to create the Train, Dev, and Test sets.
This is going to involve some manual analysis to divide up the results.

In [7]:
threshold = 0.04167  # relative threshold between hours: 0.04166666662786156

num_times = len(obs_times)
print("Number of time entries: %d\n" % num_times)

seq_start = 0
seq_end = 0

for i in range(len(obs_times) - 1):
    cur = obs_times[i]
    nxt = obs_times[i + 1]
    if nxt - cur > threshold:
        seq_end = i
        
        print(i, mdtm(obs_times[seq_start]), mdtm(obs_times[seq_end]), obs_times[seq_start], obs_times[seq_end], sep='\t')
        seq_start = seq_end + 1
    



Number of time entries: 103333

6922	1996-12-06 02:00:00.000003	1997-09-20 12:00:00	729365.0833333334	729653.5
15856	1997-10-13 21:00:00	1998-10-21 02:00:00.000003	729676.875	730049.0833333334
23344	1998-11-04 08:00:00.000003	1999-09-12 06:59:59.999997	730063.3333333334	730375.2916666666
51016	1999-10-18 18:59:59.999997	2002-12-14 18:00:00	730411.7916666666	731564.75
60554	2003-01-10 18:00:00	2004-02-12 03:00:00	731591.75	731989.125
74157	2004-04-11 15:59:59.999997	2005-10-30 09:59:59.999997	732048.6666666666	732615.4166666666
87509	2005-11-17 20:00:00.000003	2007-05-28 03:00:00	732633.8333333334	733190.125


### Analysis results
After doing analysis on this, see the file `data/obs_ww3_stats.ods`.

Here are the following allocations:
* Train -> idx_range(0:74158)
* Dev -> idx_range(74158:87510)
* Test -> idx_range(87510:-1)

In [9]:
outdir = 'data/'
suffix = str(datetime.date.today()) + '.npz'
trn_filename = outdir + buoy_str + '_waves_TRAIN_' + suffix
dev_filename = outdir + buoy_str + '_waves_DEV_' + suffix
tst_filename = outdir + buoy_str + '_waves_TEST_' + suffix

trn_beg = 0
trn_end = 74158
dev_beg = trn_end
dev_end = 87510
tst_beg = dev_end
tst_end = -1

# save_single_set_to_npz(filename, obs_times, convert_to_third_order_tensor(buoy_tensor), convert_to_third_order_tensor(global_tensor))

buoy_tens3o = convert_to_third_order_tensor(buoy_tensor)
glob_tens3o = convert_to_third_order_tensor(global_tensor)

# save train
save_single_set_to_npz(trn_filename, 
                       obs_times[trn_beg:trn_end], 
                       buoy_tens3o[trn_beg:trn_end, :, :], 
                       glob_tens3o[trn_beg:trn_end, :, :])
print("... saved TRAIN: " + trn_filename)

# save dev
save_single_set_to_npz(dev_filename, 
                       obs_times[dev_beg:dev_end], 
                       buoy_tens3o[dev_beg:dev_end, :, :], 
                       glob_tens3o[dev_beg:dev_end, :, :])
print("... saved DEV: " + dev_filename)


# save test
save_single_set_to_npz(tst_filename, 
                       obs_times[tst_beg:tst_end],
                       buoy_tens3o[tst_beg:tst_end, :, :], 
                       glob_tens3o[tst_beg:tst_end, :, :])
print("... saved TEST: " + tst_filename)

... saved TRAIN: data/46214_waves_TRAIN_2019-03-29.npz
... saved DEV: data/46214_waves_DEV_2019-03-29.npz
... saved TEST: data/46214_waves_TEST_2019-03-29.npz
