In [12]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import pandas as pd
import numpy as np


In [13]:
df = pd.read_csv("../data/btc_15m_data_2018_to_2025.csv")

In [14]:
df = df[['Close time', 'Open', 'High', 'Low', 'Close', 'Volume']]
df['Close time'] = pd.to_datetime(df['Close time'])

df.head()

Unnamed: 0,Close time,Open,High,Low,Close,Volume
0,2018-01-01 00:14:59.999,13715.65,13715.65,13400.01,13556.15,123.616013
1,2018-01-01 00:29:59.999,13533.75,13550.87,13402.0,13521.12,98.13643
2,2018-01-01 00:44:59.999,13500.0,13545.37,13450.0,13470.41,79.904037
3,2018-01-01 00:59:59.999,13494.65,13690.87,13450.0,13529.01,141.699719
4,2018-01-01 01:14:59.999,13528.99,13571.74,13402.28,13445.63,72.537533


In [15]:

def calculate_macd(df, fast=12, slow=26, signal=9):
    df['EMA_fast'] = df['Close'].ewm(span=fast, adjust=False).mean()
    df['EMA_slow'] = df['Close'].ewm(span=slow, adjust=False).mean()
    df['MACD'] = df['EMA_fast'] - df['EMA_slow']
    df['Signal_Line'] = df['MACD'].ewm(span=signal, adjust=False).mean()
    df['MACD_Histogram'] = df['MACD'] - df['Signal_Line']
    df.drop(columns=['EMA_fast', 'EMA_slow', 'MACD', 'Signal_Line'], inplace=True)
    return df

def bolinger_bands(df, window=20, std=2):
    df['SMA'] = df['Close'].rolling(window=window).mean()
    df['BB_up'] = df['SMA'] + (df['Close'].rolling(window=window).std() * std)
    df['BB_down'] = df['SMA'] - (df['Close'].rolling(window=window).std() * std)
    return df

def on_balance_volume(df):
    df['OBV'] = np.where(df['Close'] > df['Close'].shift(1), df['Volume'], np.where(df['Close'] < df['Close'].shift(1), -df['Volume'], 0)).cumsum()
    return df

def comodiity_channel_index(df, window=14):
    df['CCI'] = (df['Close'] - df['Close'].rolling(window=window).mean()) / (0.015 * df['Close'].rolling(window=window).std())
    return df

def calculate_rsi(df, window=14):
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    return df


def calculate_future_price(df, window=5):
    def weight_average(x):
        return np.dot(x, [0.33333333, 0.26666667, 0.2, 0.13333333, 0.06666667])
    df['future_price'] = df['Close'].rolling(window=window).apply(weight_average, raw=True).shift(-window)
    return df

def adding_features(df):
    df = calculate_macd(df)
    df = bolinger_bands(df)
    df = calculate_rsi(df)
    df = on_balance_volume(df)
    df = comodiity_channel_index(df)
    df = calculate_future_price(df)
    return df
adding_features(df)

Unnamed: 0,Close time,Open,High,Low,Close,Volume,MACD_Histogram,SMA,BB_up,BB_down,RSI,OBV,CCI,future_price
0,2018-01-01 00:14:59.999,13715.65,13715.65,13400.01,13556.15,123.616013,0.000000,,,,,0.000000,,13501.702000
1,2018-01-01 00:29:59.999,13533.75,13550.87,13402.00,13521.12,98.136430,-2.235533,,,,,-98.136430,,13490.999333
2,2018-01-01 00:44:59.999,13500.00,13545.37,13450.00,13470.41,79.904037,-6.750630,,,,,-178.040467,,13479.376666
3,2018-01-01 00:59:59.999,13494.65,13690.87,13450.00,13529.01,141.699719,-5.466529,,,,,-36.340748,,13438.753334
4,2018-01-01 01:14:59.999,13528.99,13571.74,13402.28,13445.63,72.537533,-9.663292,,,,,-108.878281,,13411.214666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252616,2025-03-21 23:14:59.999,84185.38,84226.26,84090.54,84226.26,112.679830,-3.845796,84166.0810,84477.829204,83854.332796,67.406091,107933.428644,9.410030,
252617,2025-03-21 23:29:59.999,84226.25,84226.26,84119.52,84128.64,35.561860,-12.312173,84166.3810,84477.964692,83854.797308,58.380682,107897.866784,-43.984623,
252618,2025-03-21 23:44:59.999,84128.63,84128.88,84076.49,84077.91,44.487950,-20.845413,84155.5215,84463.332395,83847.710605,58.306936,107853.378834,-92.274626,
252619,2025-03-21 23:59:59.999,84077.91,84098.14,84050.16,84088.79,38.438070,-24.993916,84154.8780,84463.216365,83846.539635,49.876760,107891.816904,-85.402538,


In [16]:
df['predict_trend'] = df['future_price'] - df['Close']
df["BB_up_diff"] = df["BB_up"] - df["Close"]
df["BB_down_diff"] = df["BB_down"] - df["Close"]
def create_obv_zscore(df, window=20):
    df['OBV_mean'] = df['OBV'].rolling(window=window).mean()
    df['OBV_std'] = df['OBV'].rolling(window=20).std()
    df['OBV_Z'] = (df['OBV'] - df['OBV_mean']) / df['OBV_std']
    return df.drop(['OBV_mean', 'OBV_std'], axis=1)
df = create_obv_zscore(df)
df = df[[ 'Close time', 'Open', 'High', 'Low' ,'Close', 'BB_up_diff', 'BB_down_diff', 'OBV_Z', 'MACD_Histogram', 'RSI', 'CCI', 'predict_trend']]

In [None]:
## might want to remove open, close , high, low
def final_preprocess_data(train_df):
    for col in ['Open', 'High', 'Low', 'Close']:
        train_df[f'{col}_log_return'] = np.log(train_df[col] / train_df[col].shift(1))
    
    train_df['price_mean'] = train_df[['Open', 'High', 'Low', 'Close']].mean(axis=1)
    train_df['price_std'] = train_df[['Open', 'High', 'Low', 'Close']].std(axis=1)
    train_df['price_range'] = train_df['High'] - train_df['Low']
    

    std_cols = ['MACD_Histogram', 'CCI']
    std_scaler = StandardScaler().fit(train_df[std_cols])
    train_df[std_cols] = std_scaler.transform(train_df[std_cols])
    

    train_df['RSI'] = np.clip(train_df['RSI'], 30, 70)
    rsi_scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_df[['RSI']])
    train_df['RSI'] = rsi_scaler.transform(train_df[['RSI']])
    
    
    for col in ['RSI', 'MACD_Histogram', 'CCI', 'predict_trend']:
        for lag in range(1, 10):  
            train_df[f'{col}_lag_{lag}'] = train_df[col].shift(lag)
    
   
    train_df.dropna(inplace=True)
    
    return train_df

df = final_preprocess_data(df)
sc = StandardScaler()
df['predict_trend'] = sc.fit_transform(df[['predict_trend']])
y = df['predict_trend']
x = df.drop(columns=['predict_trend'])
y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)
x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)

In [19]:
df.head()

Unnamed: 0,Close time,Open,High,Low,Close,BB_up_diff,BB_down_diff,OBV_Z,MACD_Histogram,RSI,...,CCI_lag_9,predict_trend_lag_1,predict_trend_lag_2,predict_trend_lag_3,predict_trend_lag_4,predict_trend_lag_5,predict_trend_lag_6,predict_trend_lag_7,predict_trend_lag_8,predict_trend_lag_9
22,2018-01-01 05:44:59.999,13618.51,13629.0,13530.0,13580.01,122.621288,-375.226288,0.94514,0.6597,0.733648,...,0.887338,-29.779333,-37.676,-1.005333,98.466667,145.969334,171.987334,45.299334,-75.812,-124.859333
23,2018-01-01 05:59:59.999,13558.95,13600.0,13526.5,13558.99,147.400432,-354.967432,0.597146,0.559312,0.96824,...,0.507227,34.180667,-29.779333,-37.676,-1.005333,98.466667,145.969334,171.987334,45.299334,-75.812
24,2018-01-01 06:14:59.999,13539.0,13571.63,13510.0,13532.32,180.690617,-326.248617,0.160455,0.427014,0.821151,...,-0.069258,102.068001,34.180667,-29.779333,-37.676,-1.005333,98.466667,145.969334,171.987334,45.299334
25,2018-01-01 06:29:59.999,13532.0,13701.13,13510.51,13687.76,48.197572,-491.859572,0.771593,0.55471,0.961494,...,-0.428818,195.540667,102.068001,34.180667,-29.779333,-37.676,-1.005333,98.466667,145.969334,171.987334
26,2018-01-01 06:44:59.999,13687.76,13750.0,13620.01,13722.47,50.480617,-535.316617,1.0138,0.655736,0.890403,...,0.197149,52.236,195.540667,102.068001,34.180667,-29.779333,-37.676,-1.005333,98.466667,145.969334
