## Load the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the BTC dataset with indicators; the 'Date' column is parsed as datetimes.
df = pd.read_csv(
    "../data/Final_Btc_Data_with_Indicators.csv",
    parse_dates=['Date'],
)

In [3]:
# Preview the first rows of the freshly loaded (unscaled) frame.
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,EPU,GPRD,Gold,MA50,EMA50,OBV,MACD,MACD Signal,PSAR
0,2014-04-01,463.5,444.7,487.4,438.0,3970.0,0.0423,111.578388,120.928154,1279.599976,463.5,463.5,3970.0,0.0,0.0,463.5
1,2014-04-02,424.4,463.5,480.3,409.3,4380.0,-0.0843,111.578388,94.748909,1290.5,443.95,461.966667,-410.0,-3.119088,-0.623818,424.4
2,2014-04-03,436.3,424.4,448.7,386.7,4020.0,0.0281,111.578388,63.297474,1284.400024,441.4,460.960131,3610.0,-4.577986,-1.414651,487.4
3,2014-04-04,444.4,436.3,456.5,415.5,2570.0,0.0185,111.578388,59.551781,1303.199951,442.15,460.310714,6180.0,-5.022672,-2.136255,485.386
4,2014-04-05,456.6,444.4,461.2,439.4,1160.0,0.0276,111.578388,60.448311,1303.199951,445.04,460.165196,7340.0,-4.340615,-2.577127,483.41228


## Null values

In [4]:
# Count missing values per column.
df.isna().sum()

Date           0
Price          0
Open           0
High           0
Low            0
Vol.           0
Change %       0
EPU            0
GPRD           0
Gold           0
MA50           0
EMA50          0
OBV            0
MACD           0
MACD Signal    0
PSAR           0
dtype: int64

## Select features for modeling

In [5]:
# Column to predict.
target_col = "Price"
# Every remaining column except the target and the date becomes a model input.
features = [col for col in df.columns if col not in (target_col, "Date")]

## Get train df

In [6]:
# The first 70% of the rows (in file order) form the training portion.
train_size = int(len(df) * 0.7)
train_df = df.iloc[:train_size]

## Scale the features

In [7]:
from sklearn.preprocessing import StandardScaler
import pickle

# One scaler for the feature columns, one for the target column.
feature_scaler, target_scaler = StandardScaler(), StandardScaler()

# Scaling statistics are estimated from the training rows only.
feature_scaler.fit(train_df[features])
# Double brackets keep the target as a 2-D, single-column frame for the scaler.
target_scaler.fit(train_df[[target_col]])

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [8]:
# Persist both fitted scalers as pickle files.
for scaler_path, scaler_obj in (
    ("../models/scaler/feature_scaler.pkl", feature_scaler),
    ("../models/scaler/target_scaler.pkl", target_scaler),
):
    with open(scaler_path, "wb") as f:
        pickle.dump(scaler_obj, f)

In [9]:
# Apply the train-fitted scalers to the whole frame.
# NOTE: this overwrites `df` in place, so the cell is not idempotent --
# re-running it without reloading `df` scales already-scaled values.
df[features] = feature_scaler.transform(df[features])
# transform() yields a single-column 2-D array; take that column as 1-D.
df[target_col] = target_scaler.transform(df[[target_col]])[:, 0]

In [10]:
# Preview the first rows again, now that the columns have been scaled.
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,EPU,GPRD,Gold,MA50,EMA50,OBV,MACD,MACD Signal,PSAR
0,2014-04-01,-0.619852,-0.623591,-0.6168,-0.626223,-0.771881,1.013036,-1.079547,0.583055,-0.343253,-0.67575,-0.675218,-1.001312,-0.233187,-0.241462,-0.623429
1,2014-04-02,-0.624188,-0.621492,-0.617565,-0.629553,-0.77016,-2.224378,-1.079547,-0.020825,-0.293792,-0.678356,-0.675421,-1.001985,-0.238152,-0.242506,-0.627906
2,2014-04-03,-0.622868,-0.625858,-0.620967,-0.632176,-0.771671,0.649914,-1.079547,-0.74632,-0.321472,-0.678696,-0.675554,-1.001368,-0.240475,-0.24383,-0.620692
3,2014-04-04,-0.62197,-0.624529,-0.620127,-0.628834,-0.777755,0.404422,-1.079547,-0.832722,-0.236164,-0.678596,-0.67564,-1.000973,-0.241183,-0.245038,-0.620923
4,2014-04-05,-0.620617,-0.623625,-0.619621,-0.62606,-0.783671,0.637128,-1.079547,-0.812042,-0.236164,-0.67821,-0.675659,-1.000795,-0.240097,-0.245776,-0.621149


## Creating supervised sequences

In [11]:
def create_sequences(data, target, seq_len, horizon):
    """Build sliding-window samples for supervised sequence forecasting.

    Sample ``i`` pairs the window ``data[i : i + seq_len]`` with the target
    value ``horizon`` steps after the window's last row, i.e.
    ``target[i + seq_len + horizon - 1]``.

    Parameters
    ----------
    data : array-like, first axis indexes time steps
        Input rows the windows are cut from.
    target : array-like, same length as ``data``
        Values to predict, aligned row-for-row with ``data``.
    seq_len : int
        Number of consecutive rows per window (must be >= 1).
    horizon : int
        Steps between the last row of a window and its target (must be >= 0;
        0 selects the target aligned with the window's last row).

    Returns
    -------
    X : np.ndarray, shape ``(n_samples, seq_len, *data.shape[1:])``
    y : np.ndarray, shape ``(n_samples, *target.shape[1:])``
        ``n_samples = len(data) - seq_len - horizon + 1``. When the input is
        too short for a single sample, correctly shaped empty arrays are
        returned.

    Raises
    ------
    ValueError
        If ``seq_len < 1``, ``horizon < 0``, or ``data`` and ``target``
        differ in length.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    if horizon < 0:
        raise ValueError(f"horizon must be >= 0, got {horizon}")
    if len(data) != len(target):
        # Unequal lengths would silently misalign windows and targets.
        raise ValueError(
            f"data and target must have the same length, "
            f"got {len(data)} and {len(target)}"
        )

    n_samples = len(data) - seq_len - horizon + 1
    if n_samples <= 0:
        # Keep the trailing dimensions so downstream shape logic still works.
        empty_X = np.empty((0, seq_len) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    X = np.array([data[i:i + seq_len] for i in range(n_samples)])
    # Targets for consecutive windows are consecutive rows of `target`,
    # starting at the target of window 0; copy so y does not alias the input.
    first_target = seq_len + horizon - 1
    y = target[first_target:first_target + n_samples].copy()
    return X, y

In [12]:
# Window length in rows (5 here; 15 for the second case).
seq_len = 5
# Steps ahead of the window's last row to predict; may equal seq_len or differ.
horizon = 5
X, y = create_sequences(
    data=df[features].values,
    target=df[target_col].values,
    seq_len=seq_len,
    horizon=horizon,
)

In [14]:
# Inspect the first input window together with its target value.
X[0], y[0]

(array([[-0.62359133, -0.61680012, -0.62622272, -0.77188052,  1.01303578,
         -1.07954707,  0.58305492, -0.34325252, -0.67575022, -0.67521784,
         -1.00131249, -0.23318695, -0.24146207, -0.62342905],
        [-0.62149232, -0.61756455, -0.62955314, -0.77016025, -2.22437798,
         -1.07954707, -0.02082522, -0.29379186, -0.67835568, -0.67542087,
         -1.00198548, -0.23815234, -0.2425062 , -0.62790642],
        [-0.62585781, -0.62096682, -0.6321757 , -0.77167073,  0.64991354,
         -1.07954707, -0.74631968, -0.3214715 , -0.67869552, -0.67555414,
         -1.0013678 , -0.24047482, -0.24382989, -0.62069223],
        [-0.62452919, -0.62012702, -0.62883368, -0.77775461,  0.40442245,
         -1.07954707, -0.83272209, -0.23616374, -0.67859557, -0.67564012,
         -1.00097291, -0.24118273, -0.2450377 , -0.62092286],
        [-0.62362482, -0.61962098, -0.62606026, -0.78367065,  0.63712755,
         -1.07954707, -0.81204171, -0.23616374, -0.67821042, -0.67565939,
         -1.

## Train / validation / test split

In [22]:
# Chronological 70% / 10% / remainder split (no shuffling).
# NOTE(review): the sizes are derived from len(df), while X/y hold
# len(df) - seq_len - horizon + 1 windows -- confirm this is intended.
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.1)
val_end = train_size + val_size

X_train, X_val, X_test = X[:train_size], X[train_size:val_end], X[val_end:]
y_train, y_val, y_test = y[:train_size], y[train_size:val_end], y[val_end:]

## Wrap in PyTorch DataLoaders

In [30]:
import torch
from torch.utils.data import TensorDataset, DataLoader


def _as_dataset(inputs, labels):
    """Wrap an (inputs, labels) pair as float32 tensors in a TensorDataset."""
    return TensorDataset(
        torch.tensor(inputs, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )


train_ds = _as_dataset(X_train, y_train)
val_ds = _as_dataset(X_val, y_val)
test_ds = _as_dataset(X_test, y_test)

# shuffle=False on every loader: no shuffling for time series.
train_loader, val_loader, test_loader = (
    DataLoader(split_ds, batch_size=256, shuffle=False)
    for split_ds in (train_ds, val_ds, test_ds)
)


In [36]:
# Report the shapes of the inputs and targets in each split.
for split_label, split_X, split_y in (
    ("Train shape:", X_train, y_train),
    ("Val shape:", X_val, y_val),
    ("Test shape:", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

Train shape: (2557, 5, 14) (2557,)
Val shape: (365, 5, 14) (365,)
Test shape: (723, 5, 14) (723,)
