In [None]:
# basic
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import os
import sys
import joblib
import random
import datetime

In [None]:
# Load the two persisted scaler objects: the first is applied (via .transform)
# to the covariate columns, the second to the target column, in the cells below.
# NOTE(review): joblib.load unpickles its input, so only load trusted files.
X_loaded_scaler, Y_loaded_scaler = (
    joblib.load(path) for path in ('X_scaler.joblib', 'Y_scaler.joblib')
)
# keep the last-used filename bound, as the cell has always done
scaler_filename = 'Y_scaler.joblib'

In [None]:
num_stations = 50  # number of stations (CSV files 1..num_stations) stacked into the arrays

# windowing
window_size = 30  # number of consecutive days in a single row (one window)
window_step = 10  # days between consecutive window starts; a step smaller than
                  # window_size gives overlapping windows, equal gives non-overlapping
total_days = 366  # days considered per year (leap-year length)

# 1-based day-of-year on which each window starts. Only the first
# (total_days // window_size) * window_size days are tiled, so the last window
# never runs past that point: with the values above the starts are
# 1, 11, ..., 331 and the final window covers days 331-360.
window_starts = [
    offset + 1
    for offset in range(
        0,
        (total_days // window_size) * window_size - window_size + 1,
        window_step,
    )
]
num_windows = len(window_starts)

# number of prechosen covariates (must adjust if you drop out some columns/features);
# it has to match the number of covariate columns written into each row of X below
num_var = 77 - 1

### Training data, folded by year

In [None]:
# modify this
dataset = 'training'  # or testing

# Output path stems for the target (Y), covariate (X) and station-id (Z) arrays.
# No extension here: the fold loop appends the fold's years and ".npy" when saving.
target_file, covariates_file, units_file = (
    f"{dataset}/{prefix}-size{window_size}-step{window_step}-station{num_stations}"
    for prefix in ('Y', 'X', 'Z')
)

In [None]:
# Read the first station's file only to discover which calendar years are present;
# those years are what gets split into folds further down.
_ = 1
table = pd.read_csv(f'{dataset}/{_}.csv')
# columns from the fourth onward; the first of them is later used as the target
# and the remainder as covariates
numeric_columns = list(table.columns[3:])
# datetime processing
table = table.assign(date=pd.to_datetime(table['date']))
table = table.assign(year=table['date'].dt.year)
years = pd.unique(table['year'])

In [None]:
# Split the calendar years into 8 shuffled folds. For each split, the years in
# the held-out index set are turned into window arrays and written to their own
# .npy files (one Y/X/Z triple per fold, named after the years it contains).
kf = KFold(n_splits=8, shuffle=True, random_state=42)
for fold_train, fold_val in kf.split(years):  # only the held-out indices are used
    fold_years = years[fold_val]
    num_years = len(fold_years)
    print(fold_years)

    # initialize arrays; max_rows is an upper bound that is reached only when
    # every station has data for every year of the fold
    max_rows = num_years * num_stations * num_windows
    Y = np.zeros((max_rows, window_size))           # scaled target, one value per day
    X = np.zeros((max_rows, window_size, num_var))  # scaled covariates per day
    Z = np.zeros((max_rows, window_size))           # station id repeated for each day

    # run through all datasets into tensors
    ctr = 0
    for station in range(1, num_stations + 1):
        if station % 10 == 0:
            print(f'Station {station}')
        table = pd.read_csv(f'{dataset}/{station}.csv')
        numeric_columns = table.columns.tolist()[3:]
        # datetime processing; derive every helper column BEFORE filtering so no
        # column is ever assigned onto a filtered slice (SettingWithCopyWarning)
        table['date'] = pd.to_datetime(table['date'])
        table['year'] = table['date'].dt.year
        table['dayofyear'] = table['date'].dt.dayofyear
        table = table[table.year.isin(fold_years)]
        yrs = table.year.unique()
        # subset by year to get time series
        for yr in yrs:
            year_table = table[table.year == yr]  # loop-invariant for all windows
            for wn in window_starts:
                dayofyear1 = wn
                dayofyear2 = wn + window_size
                in_window = (year_table.dayofyear >= dayofyear1) & (year_table.dayofyear < dayofyear2)
                subtable = year_table[in_window]
                # Fail loudly on an incomplete window: a single-row window would
                # otherwise be silently broadcast across all window_size days.
                if len(subtable) != window_size:
                    raise ValueError(
                        f'Station {station}, year {yr}: window starting on day {wn} '
                        f'has {len(subtable)} rows, expected {window_size}'
                    )
                x = subtable[numeric_columns[1:]].to_numpy()                 # covariates
                y = subtable[numeric_columns[0]].to_numpy().reshape(-1, 1)   # target as a column
                scaled_X = X_loaded_scaler.transform(x)
                scaled_Y = Y_loaded_scaler.transform(y)
                Y[ctr] = scaled_Y.ravel()
                X[ctr] = scaled_X
                Z[ctr] = station
                ctr += 1

    # Rows beyond ctr were never written (some station lacked a fold year);
    # drop them instead of saving all-zero samples tagged with station id 0.
    if ctr < max_rows:
        print(f'Filled {ctr} of {max_rows} rows; dropping the unused rows')
        Y, X, Z = Y[:ctr], X[:ctr], Z[:ctr]

    years_suffix = ''.join(f'_{yr}' for yr in fold_years)

    # save the arrays
    np.save(f'{target_file}{years_suffix}.npy', Y)
    np.save(f'{covariates_file}{years_suffix}.npy', X)
    np.save(f'{units_file}{years_suffix}.npy', Z)

    print()


### Testing data

In [None]:
# modify this
num_years = 6  # years per station; will depend on training (24) or testing (6)
dataset = 'testing'  # or training

# Full output paths (extension included) for the target (Y), covariate (X)
# and station-id (Z) arrays of this dataset.
target_file, covariates_file, units_file = (
    f"{dataset}/{prefix}-size{window_size}-step{window_step}-station{num_stations}.npy"
    for prefix in ('Y', 'X', 'Z')
)

In [None]:
# Build the window arrays for the whole dataset selected above (no folding):
# every station, every year found in its file, every window start.

# initialize arrays; max_rows assumes each station has exactly num_years years
max_rows = num_years * num_stations * num_windows
Y = np.zeros((max_rows, window_size))           # scaled target, one value per day
X = np.zeros((max_rows, window_size, num_var))  # scaled covariates per day
Z = np.zeros((max_rows, window_size))           # station id repeated for each day

# run through all datasets into tensors
ctr = 0
for station in range(1, num_stations + 1):
    if station % 10 == 0:
        print(f'Station {station}')
    table = pd.read_csv(f'{dataset}/{station}.csv')
    numeric_columns = table.columns.tolist()[3:]
    # datetime processing
    table['date'] = pd.to_datetime(table['date'])
    table['year'] = table['date'].dt.year
    table['dayofyear'] = table['date'].dt.dayofyear
    # NOTE(review): this rebinds the global `years` that the fold cell feeds to
    # KFold; re-run the year-discovery cell before re-running the fold cell.
    years = table.year.unique()
    # subset by year to get time series
    for yr in years:
        year_table = table[table.year == yr]  # loop-invariant for all windows
        for wn in window_starts:
            dayofyear1 = wn
            dayofyear2 = wn + window_size
            in_window = (year_table.dayofyear >= dayofyear1) & (year_table.dayofyear < dayofyear2)
            subtable = year_table[in_window]
            # Fail loudly on an incomplete window: a single-row window would
            # otherwise be silently broadcast across all window_size days.
            if len(subtable) != window_size:
                raise ValueError(
                    f'Station {station}, year {yr}: window starting on day {wn} '
                    f'has {len(subtable)} rows, expected {window_size}'
                )
            x = subtable[numeric_columns[1:]].to_numpy()                 # covariates
            y = subtable[numeric_columns[0]].to_numpy().reshape(-1, 1)   # target as a column
            scaled_X = X_loaded_scaler.transform(x)
            scaled_Y = Y_loaded_scaler.transform(y)
            Y[ctr] = scaled_Y.ravel()
            X[ctr] = scaled_X
            Z[ctr] = station
            ctr += 1

# Rows beyond ctr were never written (a station had fewer than num_years years);
# drop them instead of saving all-zero samples tagged with station id 0.
if ctr < max_rows:
    print(f'Filled {ctr} of {max_rows} rows; dropping the unused rows')
    Y, X, Z = Y[:ctr], X[:ctr], Z[:ctr]

# save the arrays
np.save(target_file, Y)
np.save(covariates_file, X)
np.save(units_file, Z)