In [1]:
import xarray as xr
import ocf_blosc2
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from tqdm import tqdm


In [2]:
nwp_data = xr.open_dataset("../../../mnt/disks/gcp_data/nwp/ecmwf/UK_v2.zarr")
nwp_data

In [3]:
meta_data = pd.read_csv("data_files/metadata.csv")
meta_data.head()

Unnamed: 0,ss_id,latitude_rounded,longitude_rounded,llsoacd,orientation,tilt,kwp,operational_at
0,2405,53.53,-1.63,E01007430,180.0,35.0,3.36,2010-11-18
1,2406,54.88,-1.38,E01008780,315.0,30.0,1.89,2010-12-03
2,2407,54.88,-1.38,E01008780,225.0,30.0,1.89,2010-12-03
3,2408,54.88,-1.38,E01008780,225.0,30.0,1.89,2010-12-03
4,2409,54.88,-1.38,E01008780,225.0,30.0,1.89,2010-12-03


In [4]:
pv_data = xr.open_dataset("data_files/pv.netcdf", engine='h5netcdf')
pv_data

In [5]:
skip_ss_ids = ['8440', '16718', '8715', '17073', '9108', '9172', '10167', '10205', '10207', '10278', '26778', '26819', '10437', '10466', '26915', '10547', '26939', '26971', '10685', '10689', '2638', '2661', '2754', '2777', '2783', '2786', '2793', '2812', '2829', '2830', '2867', '2883', '2904', '2923', '2947', '2976', '2989', '2999', '3003', '3086', '3118', '3123', '3125', '3264', '3266', '3271', '3313', '3334', '3470', '3502', '11769', '11828', '11962', '3772', '11983', '3866', '3869', '4056', '4067', '4116', '4117', '4124', '4323', '4420', '20857', '4754', '13387', '13415', '5755', '5861', '5990', '6026', '6038', '6054', '14455', '6383', '6430', '6440', '6478', '6488', '6541', '6548', '6560', '14786', '6630', '6804', '6849', '6868', '6870', '6878', '6901', '6971', '7055', '7111', '7124', '7132', '7143', '7154', '7155', '7156', '7158', '7201', '7237', '7268', '7289', '7294', '7311', '7329', '7339', '7379', '7392', '7479', '7638', '7695', '7772', '15967', '7890', '16215', '7830']

In [6]:
# select hourly pv data all of it, except the skip ids. 
# select the rest and plot graphs of years with generation.

print(len(skip_ss_ids))
hourly_pv_data = pv_data.sel(datetime=pv_data['datetime'].dt.minute == 0)

valid_ss_ids_data = [var for var in hourly_pv_data.data_vars if var not in skip_ss_ids]
pv_sites_id = np.random.choice(valid_ss_ids_data, 500, replace=False)
filtered_hourly_pv_data = hourly_pv_data[pv_sites_id]
filtered_hourly_pv_data

119


In [7]:
def get_36_hour_range(start_datetime, hours=36):
    end_datetime = start_datetime + pd.Timedelta(hours=hours - 1, minutes=59)
    return start_datetime, end_datetime

def select_non_overlapping_datetimes(datetimes, num_selections, min_gap_hours):
    selected_datetimes = []
    available_datetimes = list(datetimes)

    for _ in range(num_selections):
        if not available_datetimes:
            break
        random_datetime = np.random.choice(available_datetimes)
        selected_datetimes.append(random_datetime)
        
        # Remove datetimes within the min_gap_hours range
        available_datetimes = [dt for dt in available_datetimes if dt > random_datetime + pd.Timedelta(hours=min_gap_hours)]

    return selected_datetimes

In [8]:
datetimes = pd.to_datetime(filtered_hourly_pv_data['datetime'].values)
data_dict = {'ss_id': [], 'pv_datetime': [], 'generation' : [], 'horizon':[]}

batch_size = 36
num_selections = 5000
min_gap_hours = 36


for ss_id in pv_sites_id:
    selected_datetimes = select_non_overlapping_datetimes(datetimes, num_selections, min_gap_hours)
    
    for start_datetime in selected_datetimes:
        start, end = get_36_hour_range(start_datetime, hours=batch_size)
        selected_data = hourly_pv_data.sel(datetime=slice(start, end))

        if len(selected_data['datetime']) < batch_size or selected_data[ss_id].isnull().any():
            continue
        
        hour_counter = 1  # Initialize hour_counter for each new batch
        batch_data = {'ss_id': [], 'pv_datetime': [], 'generation': [], 'horizon': []}
        
        for dt, power in zip(selected_data['datetime'].values, selected_data[ss_id].values):
            batch_data['ss_id'].append(int(ss_id))
            batch_data['pv_datetime'].append(dt)
            batch_data['generation'].append(power)
            batch_data['horizon'].append(hour_counter)
            hour_counter += 1
        
        if hour_counter - 1 == batch_size:
            for key in data_dict.keys():
                data_dict[key].extend(batch_data[key])
        else:
            pass

In [9]:
pv_df = pd.DataFrame(data_dict)
print(pv_df.shape)
pv_df = pv_df.dropna(subset={'generation'})

pv_df
print(pv_df.shape)


(51948, 4)
(51948, 4)


In [89]:
# pv_df.to_csv("tr2.csv")

In [90]:
# pv_df = pd.read_csv("tr2.csv")
# pv_df

In [10]:
pv_sites_id = [int(id) for id in pv_sites_id]
pv_site_dict = {'ss_id':[], "lat":[], "long": [], 'tilt':[], 'orientation':[], 'kwp':[]}

for id in pv_sites_id:
    row = meta_data[meta_data['ss_id'] == id]
    if not row.empty:
        pv_site_dict['ss_id'].append(id)
        pv_site_dict['lat'].append(row['latitude_rounded'].values[0])
        pv_site_dict['long'].append(row['longitude_rounded'].values[0])
        pv_site_dict['tilt'].append(row['tilt'].values[0])
        pv_site_dict['orientation'].append(row['orientation'].values[0])
        pv_site_dict['kwp'].append(row['kwp'].values[0])
    else:
        print('row empty')
        

meta_site_df = pd.DataFrame.from_dict(pv_site_dict)
meta_site_df

row empty


Unnamed: 0,ss_id,lat,long,tilt,orientation,kwp
0,6672,53.15,-1.16,24.0,180.0,4.000
1,26923,51.75,-2.69,40.0,220.0,3.960
2,27029,53.40,-2.91,32.0,250.0,2.000
3,6504,50.34,-4.80,30.0,225.0,2.880
4,7905,54.08,-0.21,35.0,45.0,2.500
...,...,...,...,...,...,...
494,5177,52.91,1.25,30.0,180.0,3.825
495,14861,55.98,-4.20,35.0,180.0,2.000
496,7930,55.77,-4.16,30.0,216.0,3.000
497,9760,55.82,-3.94,30.0,270.0,3.000


In [11]:
combined_df = pd.merge(pv_df, meta_site_df, on='ss_id', how='inner')
combined_df

Unnamed: 0,ss_id,pv_datetime,generation,horizon,lat,long,tilt,orientation,kwp
0,6672,2021-08-02 09:00:00,1047.730835,1,53.15,-1.16,24.0,180.0,4.0
1,6672,2021-08-02 10:00:00,752.909973,2,53.15,-1.16,24.0,180.0,4.0
2,6672,2021-08-02 11:00:00,1231.332031,3,53.15,-1.16,24.0,180.0,4.0
3,6672,2021-08-02 12:00:00,2885.112061,4,53.15,-1.16,24.0,180.0,4.0
4,6672,2021-08-02 13:00:00,2011.500000,5,53.15,-1.16,24.0,180.0,4.0
...,...,...,...,...,...,...,...,...,...
51943,2835,2021-10-22 04:00:00,0.000000,32,52.56,-1.14,30.0,100.0,2.0
51944,2835,2021-10-22 05:00:00,0.000000,33,52.56,-1.14,30.0,100.0,2.0
51945,2835,2021-10-22 06:00:00,0.000000,34,52.56,-1.14,30.0,100.0,2.0
51946,2835,2021-10-22 07:00:00,4.600524,35,52.56,-1.14,30.0,100.0,2.0


In [16]:
combined_df['pv_datetime'] = pd.to_datetime(combined_df['pv_datetime'])
combined_df['pv_date'] = combined_df['pv_datetime'].dt.date
combined_df['pv_hour'] = combined_df['pv_datetime'].dt.hour
combined_df

Unnamed: 0,ss_id,pv_datetime,generation,horizon,lat,long,tilt,orientation,kwp,pv_date,pv_hour
0,6672,2021-08-02 09:00:00,1047.730835,1,53.15,-1.16,24.0,180.0,4.0,2021-08-02,9
1,6672,2021-08-02 10:00:00,752.909973,2,53.15,-1.16,24.0,180.0,4.0,2021-08-02,10
2,6672,2021-08-02 11:00:00,1231.332031,3,53.15,-1.16,24.0,180.0,4.0,2021-08-02,11
3,6672,2021-08-02 12:00:00,2885.112061,4,53.15,-1.16,24.0,180.0,4.0,2021-08-02,12
4,6672,2021-08-02 13:00:00,2011.500000,5,53.15,-1.16,24.0,180.0,4.0,2021-08-02,13
...,...,...,...,...,...,...,...,...,...,...,...
51943,2835,2021-10-22 04:00:00,0.000000,32,52.56,-1.14,30.0,100.0,2.0,2021-10-22,4
51944,2835,2021-10-22 05:00:00,0.000000,33,52.56,-1.14,30.0,100.0,2.0,2021-10-22,5
51945,2835,2021-10-22 06:00:00,0.000000,34,52.56,-1.14,30.0,100.0,2.0,2021-10-22,6
51946,2835,2021-10-22 07:00:00,4.600524,35,52.56,-1.14,30.0,100.0,2.0,2021-10-22,7


In [17]:
combined_df.to_csv("result_data/combined_df.csv")

In [18]:
combined_df['pv_datetime'] = pd.to_datetime(combined_df['pv_datetime'])
combined_df

Unnamed: 0,ss_id,pv_datetime,generation,horizon,lat,long,tilt,orientation,kwp,pv_date,pv_hour
0,6672,2021-08-02 09:00:00,1047.730835,1,53.15,-1.16,24.0,180.0,4.0,2021-08-02,9
1,6672,2021-08-02 10:00:00,752.909973,2,53.15,-1.16,24.0,180.0,4.0,2021-08-02,10
2,6672,2021-08-02 11:00:00,1231.332031,3,53.15,-1.16,24.0,180.0,4.0,2021-08-02,11
3,6672,2021-08-02 12:00:00,2885.112061,4,53.15,-1.16,24.0,180.0,4.0,2021-08-02,12
4,6672,2021-08-02 13:00:00,2011.500000,5,53.15,-1.16,24.0,180.0,4.0,2021-08-02,13
...,...,...,...,...,...,...,...,...,...,...,...
51943,2835,2021-10-22 04:00:00,0.000000,32,52.56,-1.14,30.0,100.0,2.0,2021-10-22,4
51944,2835,2021-10-22 05:00:00,0.000000,33,52.56,-1.14,30.0,100.0,2.0,2021-10-22,5
51945,2835,2021-10-22 06:00:00,0.000000,34,52.56,-1.14,30.0,100.0,2.0,2021-10-22,6
51946,2835,2021-10-22 07:00:00,4.600524,35,52.56,-1.14,30.0,100.0,2.0,2021-10-22,7


In [19]:
results = []
batch_size = 36

In [20]:
counter = 0
for i in tqdm(range(0, len(combined_df), batch_size), desc="Processing batches"):
    batch = combined_df.iloc[i:i + batch_size]
    
    if len(batch) < batch_size:
        continue  # Skip incomplete batches

    # Get the initial pv_datetime and corresponding init_time in nwp dataset
    initial_time = batch.iloc[0]['pv_datetime']
    
    # Get the latitude and longitude from the first row in the batch
    lat = batch.iloc[0]['lat']
    lon = batch.iloc[0]['long']
    
    # Select the nearest latitude and longitude in the NWP dataset first
    nwp_sel = nwp_data.sel(latitude=lat, method="nearest").sel(longitude=lon, method="nearest")
    
    # Select the nearest init_time
    init_time_sel = nwp_sel.sel(init_time=initial_time, method="ffill")
    
    if init_time_sel.init_time.size == 0:
        continue  # No matching init_time found

    # matching_init_time = init_time_sel.init_time.values[0]

    # Retrieve the 36 steps of forecast data starting from the matched init_time
    data_sel = init_time_sel.sel(step=slice(pd.Timedelta(hours=0), pd.Timedelta(hours=35)))

    # Convert to DataFrame and pivot
    data_df = data_sel.to_dataframe().reset_index()
    pivot_df = data_df.pivot_table(index=['init_time', 'step'], columns='variable', values='ECMWF_UK').reset_index()

    if len(pivot_df) < batch_size:
        continue

    for j in range(batch_size):
        pivot_df.loc[j, 'ss_id'] = batch.iloc[j]['ss_id']
        pivot_df.loc[j, 'pv_datetime'] = batch.iloc[j]['pv_datetime']
        pivot_df.loc[j, 'generation'] = batch.iloc[j]['generation']
        pivot_df.loc[j, 'horizon'] = batch.iloc[j]['horizon']
        pivot_df.loc[j, 'lat'] = lat
        pivot_df.loc[j, 'long'] = lon
        pivot_df.loc[j, 'tilt'] = batch.iloc[j]['tilt']
        pivot_df.loc[j, 'orientation'] = batch.iloc[j]['orientation']
        pivot_df.loc[j, 'kwp'] = batch.iloc[j]['kwp']
        pivot_df.loc[j, 'pv_hour'] = batch.iloc[j]['pv_hour']

    # Append to the results list
    results.append(pivot_df)
    counter += 1


Processing batches:   0%|          | 0/1443 [00:00<?, ?it/s]

Processing batches: 100%|██████████| 1443/1443 [04:32<00:00,  5.29it/s]


In [21]:
print(len(results))
print(counter)

1443
1443


In [22]:
final_df = pd.concat(results, ignore_index=True)
final_df

variable,init_time,step,dlwrf,dswrf,duvrs,hcc,lcc,mcc,sde,sr,...,ss_id,pv_datetime,generation,horizon,lat,long,tilt,orientation,kwp,pv_hour
0,2021-08-02 00:00:00,0 days 00:00:00,0.00,0.0,0.0,0.000000,0.921265,0.810272,0.0,0.0,...,6672.0,2021-08-02 09:00:00,1047.730835,1.0,53.15,-1.16,24.0,180.0,4.0,9.0
1,2021-08-02 00:00:00,0 days 01:00:00,1257754.25,0.0,0.0,0.000000,0.937531,0.734161,0.0,0.0,...,6672.0,2021-08-02 10:00:00,752.909973,2.0,53.15,-1.16,24.0,180.0,4.0,10.0
2,2021-08-02 00:00:00,0 days 02:00:00,2539024.00,0.0,0.0,0.000000,0.931274,0.697449,0.0,0.0,...,6672.0,2021-08-02 11:00:00,1231.332031,3.0,53.15,-1.16,24.0,180.0,4.0,11.0
3,2021-08-02 00:00:00,0 days 03:00:00,3788612.00,0.0,0.0,0.000000,0.900940,0.291351,0.0,0.0,...,6672.0,2021-08-02 12:00:00,2885.112061,4.0,53.15,-1.16,24.0,180.0,4.0,12.0
4,2021-08-02 00:00:00,0 days 04:00:00,5017871.00,0.0,0.0,0.000000,0.797485,0.200592,0.0,0.0,...,6672.0,2021-08-02 13:00:00,2011.500000,5.0,53.15,-1.16,24.0,180.0,4.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51943,2021-10-20 12:00:00,1 days 07:00:00,32783312.00,13364117.0,1318937.5,0.215820,0.028229,0.000000,0.0,30921886.0,...,2835.0,2021-10-22 04:00:00,0.000000,32.0,52.56,-1.14,30.0,100.0,2.0,4.0
51944,2021-10-20 12:00:00,1 days 08:00:00,33716464.00,13364117.0,1318937.5,0.417603,0.006805,0.000519,0.0,30921886.0,...,2835.0,2021-10-22 05:00:00,0.000000,33.0,52.56,-1.14,30.0,100.0,2.0,5.0
51945,2021-10-20 12:00:00,1 days 09:00:00,34658032.00,13364117.0,1318937.5,0.173187,0.026428,0.129486,0.0,30921886.0,...,2835.0,2021-10-22 06:00:00,0.000000,34.0,52.56,-1.14,30.0,100.0,2.0,6.0
51946,2021-10-20 12:00:00,1 days 10:00:00,35594240.00,13364143.0,1318937.5,0.000092,0.026611,0.035492,0.0,30921886.0,...,2835.0,2021-10-22 07:00:00,4.600524,35.0,52.56,-1.14,30.0,100.0,2.0,7.0


In [23]:
final_df.columns

Index(['init_time', 'step', 'dlwrf', 'dswrf', 'duvrs', 'hcc', 'lcc', 'mcc',
       'sde', 'sr', 't2m', 'tcc', 'u10', 'u100', 'v10', 'v100', 'ss_id',
       'pv_datetime', 'generation', 'horizon', 'lat', 'long', 'tilt',
       'orientation', 'kwp', 'pv_hour'],
      dtype='object', name='variable')

In [24]:
# Define the cumulative variables
cumulative_vars = ['dlwrf', 'dswrf', 'duvrs', 'sr']

# Function to convert cumulative to instantaneous
def cumulative_to_instantaneous(group):
    for var in cumulative_vars:
        group[f'{var}'] = group[var].diff().fillna(group[var])
    return group

# Group by 'ss_id' and 'init_time', then apply the conversion
final_df = final_df.groupby(['ss_id', 'init_time']).apply(cumulative_to_instantaneous).reset_index(drop=True)

In [28]:
# final_df['normalize_generation'] = final_df['generation']/final_df['kwp']
# final_df = final_df.rename(columns={'kwp': 'capacity'}, inplace=True)
final_df.columns
final_df.shape

(51948, 27)

In [29]:
desired_order = ['ss_id', 'init_time', 'step', 'pv_datetime', 'pv_hour', 'horizon', 'generation', 'capacity', 'normalize_generation', 'lat', 'long', 'tilt', 'orientation', 'dlwrf', 'dswrf', 'duvrs', 'hcc', 'lcc', 'mcc', 'sde', 'sr', 't2m', 'tcc', 'u10', 'u100', 'v10', 'v100']
final_df = final_df[desired_order]

In [30]:
final_df

variable,ss_id,init_time,step,pv_datetime,pv_hour,horizon,generation,capacity,normalize_generation,lat,...,lcc,mcc,sde,sr,t2m,tcc,u10,u100,v10,v100
0,2631.0,2021-10-19 12:00:00,0 days 00:00:00,2021-10-19 18:00:00,18.0,1.0,0.000000,4.0,0.000000,51.81,...,1.000000,0.943909,0.0,0.0,290.234863,1.000000,3.040367,5.269863,6.780065,10.418222
1,2631.0,2021-10-19 12:00:00,0 days 01:00:00,2021-10-19 19:00:00,19.0,2.0,0.000000,4.0,0.000000,51.81,...,1.000000,0.935089,0.0,2240.0,290.452881,1.000000,3.167645,5.255136,7.176521,10.833017
2,2631.0,2021-10-19 12:00:00,0 days 02:00:00,2021-10-19 20:00:00,20.0,3.0,0.000000,4.0,0.000000,51.81,...,1.000000,0.999725,0.0,4928.0,290.402832,1.000000,2.951848,4.813256,7.045044,10.617472
3,2631.0,2021-10-19 12:00:00,0 days 03:00:00,2021-10-19 21:00:00,21.0,4.0,0.000000,4.0,0.000000,51.81,...,0.999969,0.967377,0.0,2560.0,289.804199,1.000000,2.746724,4.556017,7.446497,11.571623
4,2631.0,2021-10-19 12:00:00,0 days 04:00:00,2021-10-19 22:00:00,22.0,5.0,0.000000,4.0,0.000000,51.81,...,1.000000,0.993530,0.0,4096.0,289.785156,1.000000,3.175217,5.447397,7.159049,11.096762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51943,27064.0,2021-10-11 00:00:00,1 days 07:00:00,2021-10-12 11:00:00,11.0,32.0,908.971191,2.0,454.485596,51.47,...,0.069977,0.841431,0.0,181760.0,281.049805,0.946198,2.619058,4.780936,-1.124188,-3.762029
51944,27064.0,2021-10-11 00:00:00,1 days 08:00:00,2021-10-12 12:00:00,12.0,33.0,1400.040039,2.0,700.020020,51.47,...,0.089294,0.284546,0.0,289280.0,282.816650,0.891418,3.065713,4.840455,-1.597640,-3.979417
51945,27064.0,2021-10-11 00:00:00,1 days 09:00:00,2021-10-12 13:00:00,13.0,34.0,254.460007,2.0,127.230003,51.47,...,0.084595,0.744873,0.0,920064.0,284.671143,0.904755,3.102398,4.357941,-2.269724,-3.671457
51946,27064.0,2021-10-11 00:00:00,1 days 10:00:00,2021-10-12 14:00:00,14.0,35.0,266.737213,2.0,133.368607,51.47,...,0.073120,0.917603,0.0,669184.0,285.735107,0.933502,3.113303,4.248116,-3.022680,-4.408169


In [31]:
final_df.columns

Index(['ss_id', 'init_time', 'step', 'pv_datetime', 'pv_hour', 'horizon',
       'generation', 'capacity', 'normalize_generation', 'lat', 'long', 'tilt',
       'orientation', 'dlwrf', 'dswrf', 'duvrs', 'hcc', 'lcc', 'mcc', 'sde',
       'sr', 't2m', 'tcc', 'u10', 'u100', 'v10', 'v100'],
      dtype='object', name='variable')

In [32]:
final_df.to_csv("result_data/tft_data_36_final.csv")

In [33]:
tft_data = pd.read_csv("result_data/tft_data_36_final.csv")

In [34]:
tft_data

Unnamed: 0.1,Unnamed: 0,ss_id,init_time,step,pv_datetime,pv_hour,horizon,generation,capacity,normalize_generation,...,lcc,mcc,sde,sr,t2m,tcc,u10,u100,v10,v100
0,0,2631.0,2021-10-19 12:00:00,0 days 00:00:00,2021-10-19 18:00:00,18.0,1.0,0.00000,4.0,0.000000,...,1.000000,0.943909,0.0,0.0,290.23486,1.000000,3.040367,5.269863,6.780065,10.418222
1,1,2631.0,2021-10-19 12:00:00,0 days 01:00:00,2021-10-19 19:00:00,19.0,2.0,0.00000,4.0,0.000000,...,1.000000,0.935089,0.0,2240.0,290.45288,1.000000,3.167645,5.255135,7.176521,10.833017
2,2,2631.0,2021-10-19 12:00:00,0 days 02:00:00,2021-10-19 20:00:00,20.0,3.0,0.00000,4.0,0.000000,...,1.000000,0.999725,0.0,4928.0,290.40283,1.000000,2.951848,4.813256,7.045044,10.617472
3,3,2631.0,2021-10-19 12:00:00,0 days 03:00:00,2021-10-19 21:00:00,21.0,4.0,0.00000,4.0,0.000000,...,0.999969,0.967377,0.0,2560.0,289.80420,1.000000,2.746724,4.556017,7.446497,11.571623
4,4,2631.0,2021-10-19 12:00:00,0 days 04:00:00,2021-10-19 22:00:00,22.0,5.0,0.00000,4.0,0.000000,...,1.000000,0.993530,0.0,4096.0,289.78516,1.000000,3.175217,5.447397,7.159049,11.096762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51943,51943,27064.0,2021-10-11 00:00:00,1 days 07:00:00,2021-10-12 11:00:00,11.0,32.0,908.97120,2.0,454.485596,...,0.069977,0.841431,0.0,181760.0,281.04980,0.946198,2.619058,4.780936,-1.124188,-3.762029
51944,51944,27064.0,2021-10-11 00:00:00,1 days 08:00:00,2021-10-12 12:00:00,12.0,33.0,1400.04000,2.0,700.020020,...,0.089294,0.284546,0.0,289280.0,282.81665,0.891418,3.065713,4.840455,-1.597640,-3.979417
51945,51945,27064.0,2021-10-11 00:00:00,1 days 09:00:00,2021-10-12 13:00:00,13.0,34.0,254.46000,2.0,127.230003,...,0.084595,0.744873,0.0,920064.0,284.67114,0.904755,3.102398,4.357941,-2.269724,-3.671457
51946,51946,27064.0,2021-10-11 00:00:00,1 days 10:00:00,2021-10-12 14:00:00,14.0,35.0,266.73720,2.0,133.368607,...,0.073120,0.917603,0.0,669184.0,285.73510,0.933502,3.113303,4.248115,-3.022680,-4.408169


In [36]:
tft_data = tft_data.drop(columns=['Unnamed: 0'])

In [37]:
rows_per_batch = 36
num_batches_to_keep = 100

# Calculate the number of rows to keep separately
rows_to_keep = rows_per_batch * num_batches_to_keep
train_data = tft_data[:-rows_to_keep]  # All rows except the last `rows_to_keep` rows
test_data = tft_data[-rows_to_keep:]  # The last `rows_to_keep` rows


In [39]:
print("Training DataFrame shape:", train_data.shape)
print("Validation DataFrame shape:", test_data.shape)

Training DataFrame shape: (48348, 27)
Validation DataFrame shape: (3600, 27)


In [40]:
train_data.to_csv("result_data/tft_data_36_train.csv")
test_data.to_csv("result_data/tft_data_36_test.csv")