In [1]:
# deep learning
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

# basic
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import sys
import joblib
import matplotlib.pyplot as plt
import random
import datetime

In [2]:
# Path of the persisted scaler, resolved relative to the notebook's working directory.
scaler_filename = 'standard_scaler.joblib'
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# unpickling, so only load scaler files that come from a trusted source.
# Presumably a fitted sklearn StandardScaler (going by the filename and the
# StandardScaler import) -- confirm. Its .transform() is applied to every window below.
loaded_scaler = joblib.load(scaler_filename)

In [5]:
# Configuration for the windowing run -- modify this cell, then re-run everything below.
dataset = 'training' # or testing
num_years = 24 # will depend on training (24) or testing (6)
num_stations = 50 # number of stations in array to be made

# windowing
window_size = 60 # number of days in a single row
window_step = 20 # sliding window if you want nonoverlapping versus overlapping days
total_days = 366 # upper bound on day-of-year; the window grid below is derived from it

# 1-based day-of-year on which each window starts. Starts advance by window_step and
# stop at the start of the last full window_size-aligned block that fits in total_days,
# so every window [start, start + window_size) stays inside 1..total_days.
# The bound is computed from total_days (previously a hard-coded 366) so that the
# constant above actually controls the grid.
window_starts = [
    offset + 1
    for offset in range(0, (total_days // window_size) * window_size - window_size + 1, window_step)
]
num_windows = len(window_starts)

num_var = 85 # this is number of prechosen variables (must adjust if you drop out some columns/features)
# Output paths encode the windowing parameters so that different runs do not overwrite each other.
target_file = f"{dataset}/Y-size{window_size}-step{window_step}-station{num_stations}.pt"
covariates_file = f"{dataset}/X-size{window_size}-step{window_step}-station{num_stations}.pt"

In [6]:
# Build the windowed target (Y) and covariate (X) tensors from the per-station CSVs.
#
# One row is pre-allocated per (station, year, window). For every window the numeric
# columns are passed through loaded_scaler; the first scaled column becomes the target
# and the remaining columns become the covariates.
expected_rows = num_years * num_stations * num_windows
Y = torch.zeros(expected_rows, window_size)
X = torch.zeros(expected_rows, window_size, num_var)

# run through all datasets into tensors
ctr = 0
for station in range(1, num_stations + 1):
    if station % 3 == 0:
        print(f'Station {station}')
    table = pd.read_csv(f'{dataset}/{station}.csv')
    # Captured before 'year' / 'dayofyear' are appended, so those helper columns
    # are never fed to the scaler.
    numeric_columns = table.columns.tolist()[3:]
    # datetime processing
    table['date'] = pd.to_datetime(table['date'])
    table['year'] = table['date'].dt.year
    table['dayofyear'] = table['date'].dt.dayofyear
    years = table.year.unique()
    # subset by year to get time series
    for yr in years:
        # Filter by year once instead of re-scanning the full table for every window.
        year_table = table[table.year == yr]
        for wn in window_starts:
            dayofyear1 = wn
            dayofyear2 = wn + window_size
            subtable = year_table[(year_table.dayofyear >= dayofyear1) & (year_table.dayofyear < dayofyear2)]
            # Guard against incomplete windows: a single-row window would otherwise be
            # silently broadcast across all window_size days by the assignments below.
            if len(subtable) != window_size:
                raise ValueError(
                    f'Station {station}, year {yr}, window starting at day {wn}: '
                    f'expected {window_size} rows but found {len(subtable)}'
                )
            # Fail with a readable message instead of a bare IndexError when the data
            # holds more (station, year, window) combinations than were pre-allocated.
            if ctr >= expected_rows:
                raise ValueError(
                    f'More than {expected_rows} windows found; check num_years, '
                    f'num_stations and num_windows against the {dataset} data'
                )
            yx = subtable[numeric_columns].to_numpy()
            scaled_yx = loaded_scaler.transform(yx)
            y = torch.tensor(scaled_yx[:, 0])
            x = torch.tensor(scaled_yx[:, 1:])
            # Assignment casts the float64 numpy values to the float32 dtype of Y / X.
            Y[ctr] = y
            X[ctr] = x
            ctr += 1

# Rows that were never filled are all zeros and would look like real samples to a
# model, so drop them. clone() makes the saved tensor own only the kept rows
# (torch.save of a view serializes the whole underlying storage).
if ctr < expected_rows:
    print(f'Warning: only {ctr} of {expected_rows} pre-allocated rows were filled; '
          f'dropping the unused all-zero rows before saving')
    Y = Y[:ctr].clone()
    X = X[:ctr].clone()

# save the tensors
torch.save(Y, target_file)
torch.save(X, covariates_file)

Station 3
Station 6
Station 9
Station 12
Station 15
Station 18
Station 21
Station 24
Station 27
Station 30
Station 33
Station 36
Station 39
Station 42
Station 45
Station 48
Station 51
Station 54
Station 57
Station 60
Station 63
Station 66
Station 69
Station 72
Station 75
Station 78
Station 81
Station 84
Station 87
Station 90
Station 93
Station 96
Station 99
Station 102
Station 105
Station 108
Station 111
Station 114
Station 117
Station 120
Station 123
Station 126
Station 129
Station 132
Station 135
Station 138
Station 141
Station 144
Station 147
Station 150
Station 153
Station 156
Station 159
Station 162
Station 165
Station 168
Station 171
Station 174
Station 177
Station 180
Station 183
Station 186
Station 189
Station 192
Station 195
Station 198
Station 201
Station 204
Station 207
Station 210
Station 213
Station 216
Station 219
Station 222
Station 225
Station 228
Station 231
Station 234
Station 237
Station 240
Station 243
Station 246
Station 249
Station 252
Station 255
Station 258
Stat