In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import joblib

In [2]:
# Base name shared by the input CSV and by the artifacts saved further down.
filename = "KRW-XRP-5m-full"

# Columns handed to the scaler, in this order.
input_features = [
    "high_price", "low_price", "trade_price",
    "candle_acc_trade_volume",
    "rsi_14", "macd_histogram",
]

# Raw candle table as stored on disk.
df_origin = pd.read_csv(f"../../data/{filename}.csv")

print(len(df_origin))

700000


In [3]:
def extract_time_features(df):
    """Return a new frame with unused columns removed and RSI/MACD features added.

    The input frame is not modified: ``DataFrame.drop`` and
    ``DataFrame.dropna`` return new objects, and their results are kept here
    (previously both results were discarded, so nothing was dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Candle data. Must contain ``trade_price`` and every column named in
        the drop list below; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Frame without the dropped columns, with ``rsi_14`` and
        ``macd_histogram`` added, and without rows that contain NaN.
        The original index labels of the surviving rows are preserved.
    """
    # Remove columns that are not needed downstream.
    df = df.drop(columns=['market', 'candle_date_time_utc', 'candle_date_time_kst',
                          'opening_price', 'timestamp', 'candle_acc_trade_price'])

    # 14-period RSI of the trade price (an OBV feature is currently not computed).
    df['rsi_14'] = compute_rsi(df['trade_price'], window=14)

    # MACD line: 12-span EMA minus 26-span EMA of the trade price.
    exp1 = df['trade_price'].ewm(span=12, adjust=False).mean()
    exp2 = df['trade_price'].ewm(span=26, adjust=False).mean()
    macd = exp1 - exp2

    # Signal line: 9-span EMA of the MACD line.
    macd_signal = macd.ewm(span=9, adjust=False).mean()

    # MACD histogram: distance between the MACD line and its signal line.
    df['macd_histogram'] = macd - macd_signal

    # Discard rows with missing values; the result must be reassigned to take effect.
    df = df.dropna()

    return df

# RSI computation
def compute_rsi(series, window=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Parameters
    ----------
    series : pandas.Series
        Values to compute the RSI on.
    window : int, default 14
        Number of rows in each rolling window; a full window is required
        before a value is computed.

    Returns
    -------
    pandas.Series
        RSI per row. Rows where the average loss is zero get 100, rows where
        both averages are zero get 50, and any remaining NaN (including the
        rows before the first full window) is filled with 50.
    """
    change = series.diff()

    # Split the row-to-row change into its positive and negative parts.
    ups = change.where(change > 0, 0).fillna(0)
    downs = (-change.where(change < 0, 0)).fillna(0)

    mean_up = ups.rolling(window=window, min_periods=window).mean()
    mean_down = downs.rolling(window=window, min_periods=window).mean()

    no_loss = mean_down == 0
    no_move = no_loss & (mean_up == 0)

    rsi = 100 - (100 / (1 + mean_up / mean_down))
    rsi[no_loss] = 100  # division by zero: only gains in the window
    rsi[no_move] = 50   # neither gains nor losses: neutral
    return rsi.fillna(50)  # whatever is still NaN is treated as neutral


# Build the working frame from the raw candles (adds the rsi_14 and macd_histogram columns).
df = extract_time_features(df_origin)

In [4]:
# Minimum relative rise of the next row's trade_price for a positive label.
threshhold = 0.001

prices = df['trade_price'].values

# labels[i] is 1 when the price of row i+1 is at least `threshhold` (relative)
# above the price of row i, otherwise 0. The last row has no successor and
# keeps the initial 0. Computed in one vectorized pass instead of a Python loop.
labels = np.zeros(len(prices))
next_change = (prices[1:] - prices[:-1]) / prices[:-1]
labels[:-1] = next_change >= threshhold

# Drop the final row (it has no next price) and attach the matching labels.
# Note: `labels` itself keeps its full length, one more than `df`.
df = df.iloc[:-1].reset_index(drop=True)
df['label'] = labels[:-1]

print(f"라벨 1의 비율: {(df['label'] == 1).mean():.4f}")
print(f"라벨 0의 비율: {(df['label'] == 0).mean():.4f}")

라벨 1의 비율: 0.2884
라벨 0의 비율: 0.7116


In [5]:
# Learn the per-column min/max of the input features.
# NOTE(review): the scaler is fitted on every row of `df`; if a train/validation
# split is made later, the fitted range includes the held-out rows - confirm
# this is intended.
scaler = MinMaxScaler().fit(df[input_features])

# Scale the input features with the fitted ranges.
scaled_inputs = scaler.transform(df[input_features])

# Persist the fitted scaler next to the other artifacts for this data set.
joblib.dump(scaler, f"../../pickles/2025-01-16/{filename}.pkl")

['../../pickles/2025-01-16/KRW-XRP-5m-full.pkl']

In [6]:
def create_classification_sequences(data_array, labels_array, seq_length=60):
    """Cut a row-ordered array into overlapping windows with one target each.

    Window ``i`` holds rows ``i`` .. ``i + seq_length - 1`` of ``data_array``
    and is paired with ``labels_array[i + seq_length]``.

    NOTE(review): the target is the label of the row immediately *after* the
    window, not of the window's last row. If ``labels_array[t]`` already
    describes the move from row ``t`` to ``t + 1``, the target lies one row
    beyond what the window has seen - confirm this alignment is intended.

    Parameters
    ----------
    data_array : array-like
        Rows along the first axis; any trailing feature axes are preserved.
    labels_array : array-like
        One label per row; must be at least as long as ``data_array``.
    seq_length : int, default 60
        Number of rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n, seq_length, *feature_shape)`` and ``y`` with
        ``n`` entries, where ``n = max(len(data_array) - seq_length, 0)``.
        When no window fits, ``X`` is empty but still carries the window and
        feature dimensions.

    Raises
    ------
    IndexError
        If windows can be built but ``labels_array`` is shorter than
        ``data_array``.
    """
    data = np.asarray(data_array)
    targets = np.asarray(labels_array)

    n_sequences = max(len(data) - seq_length, 0)
    if n_sequences and len(targets) < len(data):
        raise IndexError(
            f"labels_array has {len(targets)} entries but at least {len(data)} are required"
        )

    # Fill one window position at a time: seq_length slice copies instead of
    # one Python-level append per window followed by a second full copy.
    Xs = np.empty((n_sequences, seq_length) + data.shape[1:], dtype=data.dtype)
    for offset in range(seq_length):
        Xs[:, offset] = data[offset:offset + n_sequences]

    ys = targets[seq_length:seq_length + n_sequences].copy()
    return Xs, ys

# Number of consecutive rows in each input window.
SEQ_LENGTH = 60

# Take the targets from the same trimmed frame the features were scaled from,
# so both arrays have the same number of rows (the standalone `labels` array
# is one element longer than `scaled_inputs`).
X, y = create_classification_sequences(
    scaled_inputs, df['label'].values, seq_length=SEQ_LENGTH
)
assert len(X) == len(y), "feature windows and targets are misaligned"

np.save(f"../../preprocessed/2025-01-16/{filename}-X.npy", X)
np.save(f"../../preprocessed/2025-01-16/{filename}-y.npy", y)