In [1]:
import xarray as xr
import ocf_blosc2
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, Baseline
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning import Trainer


In [2]:
forecast_data = pd.read_csv("result_data/final_data_tft2.csv")
forecast_data = forecast_data.drop(columns=['Unnamed: 0'])
forecast_data = forecast_data.sort_values(['ss_id', 'pv_datetime'])
forecast_data['ss_id'] = forecast_data['ss_id'].astype(str)
forecast_data['pv_datetime'] = pd.to_datetime(forecast_data['pv_datetime'])
forecast_data['date'] = forecast_data['pv_datetime'].dt.date
forecast_data['day_of_week'] = forecast_data['pv_datetime'].dt.dayofweek
forecast_data['month'] = forecast_data['pv_datetime'].dt.month


In [3]:
forecast_data.head()

Unnamed: 0,ss_id,init_time,step,generation,pv_datetime,pv_hour,hour,lat,long,tilt,...,sr,t2m,tcc,u10,u100,v10,v100,date,day_of_week,month
6293,3147,2019-03-17 00:00:00,0 days 18:00:00,1.069375e-10,2019-03-16 18:00:00,18,1,51.12,-0.02,40.0,...,21856324.0,279.14966,0.121735,5.416376,9.102982,-0.231537,-0.851471,2019-03-16,5,3
6294,3147,2019-03-17 00:00:00,0 days 19:00:00,0.0,2019-03-16 19:00:00,19,2,51.12,-0.02,40.0,...,21863492.0,277.78784,0.001434,4.447694,7.998607,0.227203,-0.140961,2019-03-16,5,3
6295,3147,2019-03-17 00:00:00,0 days 20:00:00,0.0,2019-03-16 20:00:00,20,3,51.12,-0.02,40.0,...,21863492.0,276.7849,0.441833,4.002581,7.498468,0.140137,-0.432358,2019-03-16,5,3
6296,3147,2019-03-17 00:00:00,0 days 21:00:00,0.0,2019-03-16 21:00:00,21,4,51.12,-0.02,40.0,...,21863492.0,276.85815,0.721863,3.675459,6.80986,-0.478973,-1.726532,2019-03-16,5,3
6297,3147,2019-03-17 00:00:00,0 days 22:00:00,0.0,2019-03-16 22:00:00,22,5,51.12,-0.02,40.0,...,21863492.0,277.41187,0.018034,3.383099,6.230782,-1.177109,-2.98085,2019-03-16,5,3


In [4]:
forecast_data.columns

Index(['ss_id', 'init_time', 'step', 'generation', 'pv_datetime', 'pv_hour',
       'hour', 'lat', 'long', 'tilt', 'orientation', 'dlwrf', 'dswrf', 'duvrs',
       'hcc', 'lcc', 'mcc', 'sde', 'sr', 't2m', 'tcc', 'u10', 'u100', 'v10',
       'v100', 'date', 'day_of_week', 'month'],
      dtype='object')

In [6]:
target_variable = 'generation'
static_features = ['ss_id', 'lat', 'long', 'tilt', 'orientation']
known_future_inputs = [
    'dlwrf', 'dswrf', 'duvrs', 'hcc', 'lcc', 'mcc', 'sde', 'sr', 
    't2m', 'tcc', 'u10', 'u100', 'v10', 'v100', 'day_of_week', 'month', 'hour'
]

required_columns = static_features + known_future_inputs + [target_variable, 'pv_datetime', 'date']

forecast_data = forecast_data[required_columns]

forecast_data = forecast_data.fillna(method='bfill').fillna(method='ffill')

forecast_data['time_idx'] = forecast_data['pv_datetime'].astype('int64') // 10**9  # Convert to seconds since epoch


In [7]:
print(forecast_data.columns)

Index(['ss_id', 'lat', 'long', 'tilt', 'orientation', 'dlwrf', 'dswrf',
       'duvrs', 'hcc', 'lcc', 'mcc', 'sde', 'sr', 't2m', 'tcc', 'u10', 'u100',
       'v10', 'v100', 'day_of_week', 'month', 'hour', 'generation',
       'pv_datetime', 'date', 'time_idx'],
      dtype='object')


In [8]:
forecast_data.head()

Unnamed: 0,ss_id,lat,long,tilt,orientation,dlwrf,dswrf,duvrs,hcc,lcc,...,u100,v10,v100,day_of_week,month,hour,generation,pv_datetime,date,time_idx
6293,3147,51.12,-0.02,40.0,180.0,17286896.0,14154521.0,1444081.8,0.0,0.103668,...,9.102982,-0.231537,-0.851471,5,3,1,1.069375e-10,2019-03-16 18:00:00,2019-03-16,1552759200
6294,3147,51.12,-0.02,40.0,180.0,18194266.0,14154777.0,1444113.8,0.0,0.001434,...,7.998607,0.227203,-0.140961,5,3,2,0.0,2019-03-16 19:00:00,2019-03-16,1552762800
6295,3147,51.12,-0.02,40.0,180.0,19073552.0,14154777.0,1444113.8,0.0,0.441833,...,7.498468,0.140137,-0.432358,5,3,3,0.0,2019-03-16 20:00:00,2019-03-16,1552766400
6296,3147,51.12,-0.02,40.0,180.0,20026864.0,14154777.0,1444113.8,0.0,0.721863,...,6.80986,-0.478973,-1.726532,5,3,4,0.0,2019-03-16 21:00:00,2019-03-16,1552770000
6297,3147,51.12,-0.02,40.0,180.0,21029888.0,14154777.0,1444113.8,0.0,0.018005,...,6.230782,-1.177109,-2.98085,5,3,5,0.0,2019-03-16 22:00:00,2019-03-16,1552773600


In [9]:
# forecast_data.to_csv("test2.csv")

In [10]:
max_encoder_length = 36
max_prediction_length = 36

In [11]:
# keep it continous
# let's just do 36 hours
# check the minimum, 36, if not remove it. 
# init time 2 adjacent values has 12 hours gap. 
# use only 36 steps
# even if nwp data is missing, it is fine to give nans. model will learn
# start with 36 hours forecast
# select a lot from pv, get rid of nans, and get it for training for next steps

# pick one specific datetime , make a graph, one forecast batch, look what clouds are doing, what generation is like, 

In [12]:
training_cutoff = forecast_data["pv_datetime"].max() - pd.Timedelta(hours=max_prediction_length)


In [13]:
training_cutoff

Timestamp('2021-10-25 23:00:00')

In [14]:
training_data = forecast_data[forecast_data["pv_datetime"] <= training_cutoff]
num_training_rows = len(training_data)
print(f"Number of rows in the training set: {num_training_rows}")
print(forecast_data.shape)

Number of rows in the training set: 9291
(10149, 26)


In [15]:
forecast_data['ss_id'] = forecast_data['ss_id'].astype(str)
forecast_data['day_of_week'] = forecast_data['day_of_week'].astype(str)
forecast_data['month'] = forecast_data['month'].astype(str)
forecast_data['hour'] = forecast_data['hour'].astype(str)


In [16]:
print("Total rows in forecast_data:", len(forecast_data))
print("Rows before training_cutoff:", len(forecast_data[forecast_data.pv_datetime <= training_cutoff]))
print("Training cutoff datetime:", training_cutoff)


Total rows in forecast_data: 10149
Rows before training_cutoff: 9291
Training cutoff datetime: 2021-10-25 23:00:00


In [17]:
# Function to check if sequence is continuous and of required length
def is_continuous_block(df, block_size=48):
    if len(df) != block_size:
        return False
    time_diffs = df['pv_datetime'].diff().dropna()
    return all(time_diffs == pd.Timedelta(hours=1))

# Filter sequences to ensure continuity and correct length
filtered_data = []
for ss_id, group in forecast_data.groupby('ss_id'):
    for start_idx in range(0, len(group) - max_encoder_length + 1):
        block = group.iloc[start_idx:start_idx + max_encoder_length]
        if is_continuous_block(block, max_encoder_length):
            filtered_data.append(block)

filtered_data = pd.concat(filtered_data)
print("Rows after filtering short and discontinuous sequences:", len(filtered_data))


Rows after filtering short and discontinuous sequences: 6192


In [22]:
unique_longitudes = filtered_data['long'].unique()
print("Unique longitudes in filtered data:", unique_longitudes)
print(len(unique_longitudes))

Unique longitudes in filtered data: [-1.36 -1.3  -2.4  -3.   -1.59 -1.13 -3.96 -4.03 -1.82 -2.76 -4.15 -2.8
 -2.96 -0.31 -0.23 -1.8  -0.02 -3.4  -2.56  0.62 -3.89  1.32 -0.17 -2.44
 -1.46 -0.19 -4.77  0.9  -5.56 -4.59 -1.11 -4.09 -2.52 -3.04 -4.08]
35


In [19]:
training = TimeSeriesDataSet(
    filtered_data[lambda x: x.pv_datetime <= training_cutoff],
    time_idx="time_idx",
    target="generation",
    group_ids=["ss_id"],  # Grouping by ss_id to identify different PV sites
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["ss_id"],  # ss_id is static for each site
    static_reals=["lat", "long", "tilt", "orientation"],
    time_varying_known_categoricals=["day_of_week", "month", "hour"],
    time_varying_known_reals=["dlwrf", "dswrf", "duvrs", "hcc", "lcc", "mcc", "sde", "sr", "t2m", "tcc", "u10", "u100", "v10", "v100"],
    time_varying_unknown_reals=["generation"],
    target_normalizer=GroupNormalizer(groups=["ss_id"], transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,  # Allow missing timesteps
)

AssertionError: filters should not remove entries all entries - check encoder/decoder lengths and lags