### Prepare data

In [1]:
# Needed packages
import os
import time
from datetime import datetime,timedelta

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import io,signal

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

# Input MATLAB .mat files; both are read with scipy.io.loadmat inside load_data():
#   fobs  -> observations  (returned by load_data as `obs`)
#   fpred -> predictions   (returned by load_data as `pred`)
# The paths are relative, so they resolve against the kernel's working directory.
fobs = '../../offline_data/buoy_bulkwave_met_obs_qc.mat'
fpred = '../../offline_data/cfsr_buoy_met_pred.mat'

# Output folder: the final cell writes one .npz file per buoy into it
fol = '../../offline_data/mlwwcoast_prep_data'

In [2]:
# Collection of data-prepping functions
def matlab2datetime(matlab_datenum):
    """Convert a MATLAB serial date number into a Python ``datetime``.

    The integer part selects the calendar day through ``datetime.fromordinal``
    and the fractional part becomes the time of day. MATLAB datenums and
    Python ordinals use different day-zero conventions, hence the fixed
    366-day correction.
    """
    midnight = datetime.fromordinal(int(matlab_datenum))
    time_of_day = timedelta(days=matlab_datenum % 1)
    epoch_shift = timedelta(days=366)
    return midnight + (time_of_day - epoch_shift)

def load_data(fobs,fpred):
    """Read the observation and prediction .mat files and add derived fields.

    Parameters
    ----------
    fobs, fpred : str
        Paths handed to ``scipy.io.loadmat``.

    Returns
    -------
    (obs, pred) : tuple of dict
        The two loaded dictionaries. ``pred`` gains ``'u'`` and ``'v'``
        (computed from ``'wndspd'`` and ``'wnddir'``); both gain ``'t'``, a
        list of ``datetime`` objects built from the first element of each
        row of ``'time'``.
    """
    obs, pred = io.loadmat(fobs), io.loadmat(fpred)

    # Angle in radians used for both components.
    # NOTE(review): 90 - dir + 180 looks like a meteorological "coming from"
    # bearing turned into a math angle -- confirm against the data source.
    angle = (90-pred['wnddir']+180)*np.pi/180.0
    pred['u'] = pred['wndspd']*np.cos(angle)
    pred['v'] = pred['wndspd']*np.sin(angle)

    # Datetime axis for each data set, derived from its datenum column
    for record in (pred, obs):
        record['t'] = [matlab2datetime(row[0]) for row in record['time']]

    return (obs,pred)

def create_history_matrix(t,x,y,hr_back,hr_forward):
    """Build lagged-history, future-label and future-forecast matrices.

    Parameters
    ----------
    t : array-like, shape (Nt,)
        Numeric time axis. It is copied internally, so the caller's array is
        never modified (the edge-trimming below writes NaN into the copy).
    x : array-like, shape (Nt,)
        Series used for both the history matrix and the future labels.
    y : array-like, shape (Nt,)
        Series used for the future "prediction" matrix.
    hr_back : int
        Number of history columns. Column k of X is ``x`` shifted k steps
        back in time (k = 0 is the current time stamp).
    hr_forward : int
        Y and P get ``hr_forward - 1`` columns, holding ``x`` / ``y`` shifted
        1 .. hr_forward-1 steps ahead (the current time stamp is excluded).

    Returns
    -------
    X : ndarray, shape (Nk, hr_back)
    Y : ndarray, shape (Nk, hr_forward-1)
    P : ndarray, shape (Nk, hr_forward-1)
    t : ndarray, shape (Nk,)
        Time stamps of the rows that were kept.
    msk : ndarray of bool, shape (Nt,)
        True for every original time step that was dropped, either because
        it lies within ``max(hr_back, hr_forward)`` steps of an edge or
        because one of its X/Y/P entries is NaN.
    """
    # Private float copy: writing NaN into the caller's array would corrupt
    # it for later calls (np.squeeze in the caller returns a view), and would
    # fail outright for integer or list input.
    t = np.array(t, dtype=float)

    # Rows closer than this to either end contain values wrapped around by np.roll
    trim = np.max([hr_back,hr_forward])

    # Time length
    Nt = len(t)

    # History: column `lag` is x delayed by `lag` steps (np.roll ~ circshift in MATLAB)
    X = np.zeros((Nt,hr_back))
    for lag in range(hr_back):
        X[:,lag] = np.roll(x,lag)

    # Future labels (from x) and future predictions (from y), current step excluded
    Y = np.zeros((Nt,hr_forward-1))
    P = np.zeros((Nt,hr_forward-1))
    for lead in range(1,hr_forward):
        Y[:,lead-1] = np.roll(x,-lead)
        P[:,lead-1] = np.roll(y,-lead)

    # Invalidate the wrapped-around edges
    for arr in (X, P, Y, t):
        arr[:trim] = np.nan
        arr[-trim:] = np.nan

    # Drop every time step with a NaN in any matrix (edges and missing data)
    msk = (np.isnan(X).any(axis=1)
           | np.isnan(P).any(axis=1)
           | np.isnan(Y).any(axis=1))
    keep = ~msk

    return (X[keep,:], Y[keep,:], P[keep,:], t[keep], msk)

#----- Make yearly mean variable -----#
def make_yr_avg(time,x,plot_on=False):
    """Return the smoothed day-of-year average of ``x`` sampled at ``time``.

    Parameters
    ----------
    time : sequence of datetime
        One time stamp per sample of ``x``.
    x : array-like, shape (len(time),)
        Values to average; NaNs are ignored in the daily means.
    plot_on : bool, optional
        If true, plot the raw and smoothed daily averages (requires
        matplotlib's ``plt``, imported at the top of the notebook).

    Returns
    -------
    ndarray, shape (len(time),)
        For each sample, the smoothed average of its calendar day.

    Notes
    -----
    Calendar days without any valid sample (e.g. day 366 when the record
    contains no leap year) are filled by periodic linear interpolation from
    neighbouring days before smoothing; otherwise a single NaN would spread
    over the whole smoothing window.
    """
    x = np.asarray(x, dtype=float)

    # Zero-based day of year (0..365); integer dtype so it can index arrays
    doy = np.array([stamp.timetuple().tm_yday for stamp in time], dtype=int) - 1

    # Mean for each calendar day (366 slots so leap years are covered)
    xa = np.full(366, np.nan)
    for dd in range(366):
        vals = x[doy == dd]
        if np.any(~np.isnan(vals)):
            xa[dd] = np.nanmean(vals)

    # Fill days that have no data, treating the year as periodic
    missing = np.isnan(xa)
    if missing.any() and not missing.all():
        days = np.arange(366)
        xa[missing] = np.interp(days[missing], days[~missing], xa[~missing], period=366)

    # Smooth the daily averages (75-day window, 2nd-order polynomial)
    # In future should wrap this before smoothing and then unwrap
    xas = signal.savgol_filter(xa, 25*3, 2)

    if plot_on:
        fig, ax1 = plt.subplots(figsize=(10,5))
        ax1.plot(xa,label='Daily Avg')
        ax1.plot(xas,label='Smoothed')
        ax1.set_ylabel('U [m/s]')
        ax1.set_xlabel('Day of Year')
        ax1.legend()

    # Map the smoothed daily average back onto the input time series
    return xas[doy]

def test_regression(HU,fu_o):
    """Fit a linear regression on a random split and score it on the held-out part.

    Parameters
    ----------
    HU : array-like, shape (n_samples, n_features)
        Input matrix.
    fu_o : array-like, shape (n_samples,) or (n_samples, n_outputs)
        Target values.

    Returns
    -------
    (r2, mae) : tuple of float
        R^2 score (as computed by ``r2_score`` with its default settings) and
        mean absolute error, both evaluated on the 25 % test split.
    """
    # Train/test split; fixed random_state keeps the split reproducible
    x_tr, x_te, y_tr, y_te = train_test_split(HU, fu_o, test_size = 0.25, random_state = 42)

    # Fit the linear model on the training part
    LR = LinearRegression()
    LR.fit(x_tr,y_tr)

    # Predict the held-out part; squeeze so single-output targets are 1-D
    y_p = np.squeeze(LR.predict(x_te))
    y = np.squeeze(y_te)

    # Evaluate error. Truth and prediction already share the same layout, so
    # no transpose is applied: transposing a multi-output target would turn
    # (n_samples, n_outputs) into (n_outputs, n_samples) and break the comparison.
    r2 = r2_score(y,y_p)
    mae = np.mean(np.abs(y-y_p))

    # Pass error back
    return (r2,mae)


In [3]:
# Read the observation and prediction dictionaries from the .mat files
obs, pred = load_data(fobs, fpred)

In [4]:
#---- Make Input and Output Matrices, X, y-----#
# xo -> weather observation (interpolated onto the prediction time axis)
# xf -> weather forecast
#
# Variables saved to each .npz file
# X   -> History Matrix (time,hr-back) where hr-back = 0,1,2,...
# Y   -> Labels, (time,hr-forward) where hr_forward = 1,2,3... (pred we are trying to make)
# F   -> Weather Forecast, (time,hr-forward)
# t   -> time (time,)
# avg -> daily average, smoothed (time,1)
#
# The variable name (u or v), buoy index, hr_back and hr_forward are encoded
# in the file name.

# Select how many hours to go back in time
hr_back = 24

# Select forecast hour to predict
hr_forward = 24

# Select which variable to predict
myvar = 'v'

# Number of buoys to process (columns of pred[myvar], rows of obs[myvar])
n_buoys = 21

# np.savez does not create missing folders
os.makedirs(fol, exist_ok=True)

# Time axes. t_pred is a private float copy so that nothing downstream can
# write into pred['time'] (np.squeeze alone would return a view of it).
t_pred = np.array(np.squeeze(pred['time']), dtype=float)
to = np.squeeze(obs['time'])

for bb in range(n_buoys):

    # Interp observations onto the prediction time axis
    xf = np.squeeze(pred[myvar][:,bb])
    xo = np.interp(t_pred,to,np.squeeze(obs[myvar][bb,:]))

    # Generate history matrix and trimmed/cleaned vectors.
    # A fresh copy of the time axis is passed for every buoy so each
    # iteration starts from the same, unmodified axis.
    (X, Y, F, t, msk) = create_history_matrix(t_pred.copy(),xo,xf,hr_back,hr_forward)

    # Create yearly average (named t_dt so the imported `time` module is not shadowed)
    t_dt = [matlab2datetime(tt) for tt in t]
    xa = make_yr_avg(t_dt,X[:,0])
    xa = np.expand_dims(xa,1) # Make 2D for concat

    fsave = '{:s}/input_{:s}_buoy{:d}_back{:d}_for{:d}'.format(fol,myvar,bb,hr_back,hr_forward)
    np.savez(fsave, X=X, Y=Y, F=F, t=t, avg=xa)