# Baseline Features

The baseline features include: `Timestamp`, `Open`, `High`, `Low`, `Close`, `Volume`.
- Working with the full dataset (7.0M index entries) can lead to excessive training times.
- We'll restrict our dataset to the most recent `100,000` data points, which are likely more relevant for future predictions.
- Note that a training period of only about two months may not capture all relevant trends, even though it contains a substantial amount of data.

In [None]:
import os
import pandas as pd

# Where the preprocessed CSV lives; the full path is assembled from the
# directory and the file name so later cells can reuse PROCESSED_PATH.
PROCESSED_PATH = "data/processed/"
PROCESSED_NAME = "btcusd_1-min_data_processed.csv"
PROCESSED_FILE = os.path.join(PROCESSED_PATH, PROCESSED_NAME)

# Load the file, turn the epoch-second `Timestamp` column into timezone-aware
# (UTC) datetimes, use it as the index, and discard the `datetime` column.
# NOTE(review): the notebook text mentions restricting to the most recent
# 100,000 rows; no such truncation happens in this cell -- confirm it is
# applied upstream.
df = pd.read_csv(PROCESSED_FILE)
df = df.assign(Timestamp=pd.to_datetime(df["Timestamp"], unit="s", utc=True))
df = df.set_index("Timestamp").drop(columns=["datetime"])

# Print a summary of the resulting frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# 60/20/20 split by row position: the first 60% of rows go to training, the
# next 20% to validation, and everything that remains to the test set.
# NOTE(review): the split is positional, so it is only chronological if `df`
# is already ordered by its Timestamp index -- confirm against the input file.
total_rows = len(df)
train_size = int(0.6 * total_rows)
valid_size = int(0.2 * total_rows)
valid_end = train_size + valid_size

train_data, valid_data, test_data = (
    df.iloc[:train_size],
    df.iloc[train_size:valid_end],
    df.iloc[valid_end:],
)

# Report how many rows landed in each partition.
print(f"Training set: {len(train_data)} entries")
print(f"Validation set: {len(valid_data)} entries")
print(f"Test set: {len(test_data)} entries")

# Report the first and last timestamp covered by each partition.
print(f"Training set: {train_data.index.min()} to {train_data.index.max()}")
print(f"Validation set: {valid_data.index.min()} to {valid_data.index.max()}")
print(f"Test set: {test_data.index.min()} to {test_data.index.max()}")

# Persist each partition next to the processed source file.
for csv_name, partition in (
    ("train_data.csv", train_data),
    ("validation_data.csv", valid_data),
    ("test_data.csv", test_data),
):
    partition.to_csv(os.path.join(PROCESSED_PATH, csv_name))