In [1]:
import os
import numpy as np
import pandas as pd
import h5py

# --- Configuration ---
# Trading pair; it is used to build both the source CSV name and the output file name.
symbol = "BTCUSDT"

# Window geometry: every sample pairs `input_step_size` consecutive closes with
# the `output_size` closes that immediately follow them.
input_step_size, output_size = 50, 30  # output_size must match baseline output length

# False -> window starts advance by input_step_size rows (inputs do not overlap).
# True  -> window starts advance by a single row (overlapping samples).
sliding_window = False

# --- Locations ---
output_dir = "data/processed"
input_csv = f"data/{symbol}_2018.csv"
file_name = f"{output_dir}/{symbol.lower()}_{input_step_size}_{output_size}_prediction.h5"

# Make sure the destination folder exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)

# --- Read the raw CSV and derive the price / timestamp series ---
# In one pass: drop every row that contains a missing value, parse 'Open Time'
# into a new 'Datetime' column, and renumber the index from 0.
# NOTE(review): 'Open Time' is parsed with pandas' default inference (no
# `unit=` or `format=`); confirm this matches how the CSV stores timestamps.
df = (
    pd.read_csv(input_csv)
    .dropna()
    .assign(Datetime=lambda frame: pd.to_datetime(frame['Open Time']))
    .reset_index(drop=True)
)

# Two row-aligned NumPy arrays taken from the same frame:
# closes cast to float32, and the parsed timestamps.
prices = df['Close'].values.astype(np.float32)
times = df['Datetime'].values

# --- Build (input, target) windows ---
# A sample that starts at row `s` is made of
#   input  = rows [s, s + input_step_size)
#   target = rows [s + input_step_size, s + input_step_size + output_size)
# The two modes differ only in the distance between consecutive starts:
# one row when sliding_window is True (overlapping samples), otherwise
# input_step_size rows (inputs do not overlap; each target begins on the same
# row as the next sample's input).
window_stride = 1 if sliding_window else input_step_size
if window_stride < 1:
    raise ValueError(f"input_step_size must be >= 1, got {input_step_size}")

total_length = len(prices)
# Largest start index that still leaves room for a full input + target window.
max_start = total_length - input_step_size - output_size
# Empty when the series is shorter than one full window (max_start < 0).
window_starts = np.arange(0, max_start + 1, window_stride)

# Row-index matrices: one row per sample, one column per position in the window.
# They hold one integer per selected element, so in sliding mode they are
# roughly len(prices) * window-length entries each.
input_index = window_starts[:, None] + np.arange(input_step_size)
output_index = window_starts[:, None] + input_step_size + np.arange(output_size)

# Integer-array indexing gathers all windows at once. Unlike np.array([]) on an
# empty list, this keeps the (n_samples, window_length) shape and the source
# dtype even when there are zero samples.
inputs = prices[input_index]
outputs = prices[output_index]
input_times = times[input_index]
output_times = times[output_index]

# --- Persist the windows (HDF5) and their timestamps (.npy) ---
# Price windows are written to one HDF5 file under the keys "inputs" / "outputs".
with h5py.File(file_name, 'w') as f:
    f.create_dataset("inputs", data=inputs)
    f.create_dataset("outputs", data=outputs)

# Timestamps are written as sibling .npy files. The stem comes from
# os.path.splitext so that only the trailing extension is removed:
# str.replace('.h5', ...) would rewrite every '.h5' found in the path and, when
# the name does not end in '.h5', would leave both np.save calls pointing at
# the same file (the second overwriting the first).
times_stem = os.path.splitext(file_name)[0]
np.save(f"{times_stem}_input_times.npy", input_times)
np.save(f"{times_stem}_output_times.npy", output_times)

print(f"✅ Saved: {file_name}")
print(f"📊 Inputs shape: {inputs.shape}, Outputs shape: {outputs.shape}")


✅ Saved: data/processed/btcusdt_50_30_prediction.h5
📊 Inputs shape: (6290, 50), Outputs shape: (6290, 30)
