# Processing Buoy 46218



In [1]:
buoy_num = 46218
buoy_str = '46218B'

In [2]:
from data_utils.data_processing import *

obs_file = '/home/hutch_research/data/waves/buoys_20190315/CDIPObserved_' + buoy_str + '_hourly.mat'
ww3_file = '/home/hutch_research/data/waves/buoys_20190315/WW3CFSRphase2_' + buoy_str + '_rebanded.mat'

# Generate these for access to timegaps
buoy_object = ObservedDatasetFromFile(obs_file)
glob_object = WW3DatasetFromFile(ww3_file)

# Generate the shared 
obs_times, ww3_times, buoy_tensor, global_tensor = generate_combined_datasets(obs_file, ww3_file)

# just extract the first result
obs_times = obs_times[0]
ww3_times = ww3_times[0]
buoy_tensor = buoy_tensor[0]
global_tensor = global_tensor[0]

## Analyzing times

**Let's look at the time ranges we're working with**

In [3]:
from data_utils.matlab_datenums import matlab_datenum_to_py_date as mdtm

print("obs start:", mdtm(obs_times[0]), "end:", mdtm(obs_times[-1]), sep='\t')
print("ww3 start:", mdtm(ww3_times[0]), "end:", mdtm(ww3_times[-1]), sep='\t')

obs start:	1995-12-01 02:00:00.000003	end:	2008-12-14 01:00:00.000007
ww3 start:	1995-12-01 01:59:59.999993	end:	2008-12-14 00:59:59.999997


In [4]:
print(buoy_object.a1.shape)
print(glob_object.a1.shape)

(216936, 64)
(90584, 64)


**Now we need to analyze the time gaps**
A gap is stored as an array of arrays:

`[ [[start date, end data], [start index, end index]], 
   ...
   [[start date, end data], [start index, end index]], 
 ]`

In [5]:
# check the buoy time gaps first
gaps = buoy_object.time_gaps
num_gaps = len(gaps)
print("Number of gaps:", num_gaps)

for i in range(num_gaps):
    cur_gap = gaps[i]
    cur_gap_dates = cur_gap[0]
    start = mdtm(cur_gap_dates[0])
    end = mdtm(cur_gap_dates[1])

    start_dtm = str(start.year) + "-" + str(start.month).zfill(2) + "-" + str(start.day).zfill(2) + " " + str(start.hour).zfill(2) + ":"  + str(start.minute).zfill(2)
    end_dtm = str(end.year) + "-" + str(end.month).zfill(2) + "-" + str(end.day).zfill(2) + " " + str(end.hour).zfill(2) + ":"  + str(end.minute).zfill(2)
    
    print("gap:", str(i).zfill(2), "  start:", start, "[%d]" % cur_gap[1][0], "  end:", end, "[%d]" % cur_gap[1][1])
    # print(end)

Number of gaps: 8
gap: 00   start: 1993-09-01 00:00:00 [0]   end: 1995-12-01 02:00:00.000003 [19706]
gap: 01   start: 1996-03-15 17:00:00.000003 [22241]   end: 1998-03-19 18:00:00 [39858]
gap: 02   start: 1998-09-30 00:59:59.999997 [44521]   end: 1998-10-07 18:00:00 [44706]
gap: 03   start: 2004-04-05 00:59:59.999997 [92857]   end: 2004-05-07 23:00:00.000003 [93647]
gap: 04   start: 2007-11-07 07:00:00.000007 [124327]   end: 2007-11-27 20:00:00.000003 [124820]
gap: 05   start: 2008-12-14 02:00:00.000003 [133994]   end: 2009-02-27 21:00:00 [135813]
gap: 06   start: 2010-11-23 20:00:00.000003 [151028]   end: 2010-12-03 19:00:00.000007 [151267]
gap: 07   start: 2016-03-03 12:00:00 [197268]   end: 2016-06-03 15:00:00 [199479]


In [6]:
# Check the ww3 for time gaps -- it is very unlikely it has them

gaps = glob_object.time_gaps
num_gaps = len(gaps)
print("Number of gaps:", num_gaps)

if num_gaps > 0:
    for i in range(num_gaps):
        cur_gap = gaps[i]
        cur_gap_dates = cur_gap[0]
        start = mdtm(cur_gap_dates[0])
        end = mdtm(cur_gap_dates[1])

        start = str(start.year) + "-" + str(start.month).zfill(2) + "-" + str(start.day).zfill(2) + " " + str(start.hour).zfill(2) + ":"  + str(start.minute).zfill(2)
        end = str(end.year) + "-" + str(end.month).zfill(2) + "-" + str(end.day).zfill(2) + " " + str(end.hour).zfill(2) + ":"  + str(end.minute).zfill(2)

        print("gap:", str(i).zfill(2), "  start:", start, "[%d]" % cur_gap[1][0], "  end:", end, "[%d]" % cur_gap[1][1])

Number of gaps: 0


But the shared date range in our dataset is only:

`1996-12-06 02:00:00`	to	`2009-04-19 02:00:00`

*Note that in the data preprocessing `generate_combined_datasets` actually took this into account*

## Saving the dataset to a file

I created a method called `save_single_set_to_npz(filename, time_vector, obs_tensor, ww3_tensor, compressed=False)`

Typically I would save this with `obs_times` followed by the tensors according to their values.  The problem that I didn't solve early was managing a way to organize these into train, dev, and test.  I'll do a simple save here, and then walk through dividing up the data into train, dev, test.

Special note on this --> I also had to create a method later con called `convert_to_third_order_tensor(some_data)`, we'll need this :-\
   

In [22]:
outdir = 'data/'
suffix = str(datetime.date.today()) + '.npz'
filename = outdir + buoy_str + '_combined_data_' + suffix

save_single_set_to_npz(filename, obs_times, convert_to_third_order_tensor(buoy_tensor), convert_to_third_order_tensor(global_tensor))

print("done... saved to " + filename)

done... saved to data/46218B_combined_data_2019-03-29.npz


## Setting up Train, Dev, and Test

Now I need to create the Train, Dev, and Test sets.
This is going to involve some manual analysis to divide up the results.

In [13]:
threshold = 0.04167  # relative threshold between hours: 0.04166666662786156

num_times = len(obs_times)
print("Number of time entries: %d\n" % num_times)

seq_start = 0
seq_end = 0

for i in range(len(obs_times) - 1):
    cur = obs_times[i]
    nxt = obs_times[i + 1]
    if nxt - cur > threshold:
        seq_end = i
        
        print(i, mdtm(obs_times[seq_start]), mdtm(obs_times[seq_end]), obs_times[seq_start], obs_times[seq_end], sep='\t')

        seq_start = seq_end + 1


Number of time entries: 95203

2534	1995-12-01 02:00:00.000003	1996-03-15 15:59:59.999997	728994.0833333334	729099.6666666666
7197	1998-03-19 18:00:00	1998-09-30 00:00:00	729833.75	730028.0
55348	1998-10-07 18:00:00	2004-04-05 00:00:00	730035.75	732042.0
86028	2004-05-07 23:00:00.000003	2007-11-07 06:00:00	732074.9583333334	733353.25
95201	2007-11-27 20:00:00.000003	2007-11-07 06:00:00	733373.8333333334	733353.25


In [21]:
print(mdtm(obs_times[seq_start]), mdtm(obs_times[-1]), sep='\t')
print(obs_times[seq_start], obs_times[-1])


2007-11-27 20:00:00.000003	2008-12-14 01:00:00.000007
733373.8333333334 733756.0416666667


### Analysis results
After doing analysis on this, see the file `data/obs_ww3_stats.ods`.

Here are the following allocations:
* Train -> idx_range(0:66643)
* Dev -> idx_range(66643:80923)
* Test -> idx_range(80923:-1)

Set	Datapoints	Start_idx	End_idx	Days
Train	66643	0	66643	2776.7916666667
Dev	14280	66643	80923	595
Test	14280	80923	-1	
![image.png](attachment:image.png)


In [12]:
len(obs_times)

95203

In [23]:
outdir = 'data/'
suffix = str(datetime.date.today()) + '.npz'
trn_filename = outdir + buoy_str + '_waves_TRAIN_' + suffix
dev_filename = outdir + buoy_str + '_waves_DEV_' + suffix
tst_filename = outdir + buoy_str + '_waves_TEST_' + suffix

trn_beg = 0
trn_end = 66643
dev_beg = trn_end
dev_end = 80923
tst_beg = dev_end
tst_end = -1

# save_single_set_to_npz(filename, obs_times, convert_to_third_order_tensor(buoy_tensor), convert_to_third_order_tensor(global_tensor))

buoy_tens3o = convert_to_third_order_tensor(buoy_tensor)
glob_tens3o = convert_to_third_order_tensor(global_tensor)

# save train
save_single_set_to_npz(trn_filename, 
                       obs_times[trn_beg:trn_end], 
                       buoy_tens3o[trn_beg:trn_end, :, :], 
                       glob_tens3o[trn_beg:trn_end, :, :])
print("... saved TRAIN: " + trn_filename)

# save dev
save_single_set_to_npz(dev_filename, 
                       obs_times[dev_beg:dev_end], 
                       buoy_tens3o[dev_beg:dev_end, :, :], 
                       glob_tens3o[dev_beg:dev_end, :, :])
print("... saved DEV: " + dev_filename)


# save test
save_single_set_to_npz(tst_filename, 
                       obs_times[tst_beg:tst_end],
                       buoy_tens3o[tst_beg:tst_end, :, :], 
                       glob_tens3o[tst_beg:tst_end, :, :])
print("... saved TEST: " + tst_filename)


# print("done... saved to " + filename)

... saved TRAIN: data/46218B_waves_TRAIN_2019-03-29.npz
... saved DEV: data/46218B_waves_DEV_2019-03-29.npz
... saved TEST: data/46218B_waves_TEST_2019-03-29.npz
