![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [2]:
# autoload newest code from files
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from datetime import timedelta,datetime
from utils import load_trade_data, load_quote_data
from settings import DATA_DIR, START_DATE, END_DATE, TICKERS


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# QuantBook Analysis Tool 
# For more information see [https://www.quantconnect.com/docs/v2/our-platform/research/getting-started]
qb = QuantBook()
spy = qb.add_equity("SPY")
# Locally Lean installs free sample data, to download more data please visit https://www.quantconnect.com/docs/v2/lean-cli/datasets/downloading-data 
qb.set_start_date(2013, 10, 11)
history = qb.history(qb.securities.keys(), 360, Resolution.DAILY)

# Indicator Analysis
bbdf = qb.indicator(BollingerBands(30, 2), spy.symbol, 360, Resolution.DAILY)
bbdf.drop('standarddeviation', axis=1).plot()

In [3]:
trade_df=load_trade_data("ETHUSDT", START_DATE, END_DATE)
trade_df

Loading data for ETHUSDT from ../../../data/cryptofuture/binance/minute/ethusdt...


Unnamed: 0_level_0,open,high,low,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2026-02-03 00:00:00,2345.56,2346.90,2344.00,2344.00,2253.409
2026-02-03 00:01:00,2344.01,2344.36,2339.00,2340.61,9033.385
2026-02-03 00:02:00,2340.60,2340.79,2336.01,2337.49,4593.464
2026-02-03 00:03:00,2337.49,2338.95,2337.01,2338.10,2583.138
2026-02-03 00:04:00,2338.09,2338.09,2332.23,2333.60,7312.809
...,...,...,...,...,...
2026-02-06 23:55:00,2062.32,2063.52,2060.27,2061.96,1310.211
2026-02-06 23:56:00,2061.96,2064.00,2061.96,2062.97,1023.145
2026-02-06 23:57:00,2062.97,2063.95,2062.68,2062.92,606.092
2026-02-06 23:58:00,2062.92,2065.09,2062.92,2063.64,1993.815


In [4]:


load_quote_data("btcusdt", START_DATE, END_DATE)


Loading quote data for btcusdt from ../../../data/cryptofuture/binance/minute/btcusdt...
Found 7 quote files.


Unnamed: 0_level_0,bid_open,bid_high,bid_low,bid_close,bid_size,ask_open,ask_high,ask_low,ask_close,ask_size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2026-02-03 00:00:00,78692.5,78752.7,78668.9,78668.9,0,78692.5,78752.7,78668.9,78668.9,0
2026-02-03 00:01:00,78668.8,78688.8,78630.0,78650.1,0,78668.8,78688.8,78630.0,78650.1,0
2026-02-03 00:02:00,78650.0,78650.1,78543.4,78558.8,0,78650.0,78650.1,78543.4,78558.8,0
2026-02-03 00:03:00,78558.7,78585.1,78554.2,78560.1,0,78558.7,78585.1,78554.2,78560.1,0
2026-02-03 00:04:00,78560.0,78560.0,78489.1,78489.1,0,78560.0,78560.0,78489.1,78489.1,0
...,...,...,...,...,...,...,...,...,...,...
2026-02-06 23:55:00,70540.5,70588.0,70469.1,70527.5,0,70540.5,70588.0,70469.1,70527.5,0
2026-02-06 23:56:00,70527.4,70597.8,70527.4,70574.0,0,70527.4,70597.8,70527.4,70574.0,0
2026-02-06 23:57:00,70574.0,70576.7,70536.3,70556.0,0,70574.0,70576.7,70536.3,70556.0,0
2026-02-06 23:58:00,70556.1,70606.2,70550.8,70566.9,0,70556.1,70606.2,70550.8,70566.9,0


In [6]:
import pandas as pd
import numpy as np

symbol="btcusdt"

def calculate_features(symbol, interval='1min'):
    # 1. Load Data
    df_trade = load_trade_data(symbol, START_DATE, END_DATE, interval=interval)
    df_quote = load_quote_data(symbol, START_DATE, END_DATE, interval=interval)
    
    if df_trade is None or df_quote is None:
        return None
        
    # Merge trade and quote data
    df = pd.concat([df_trade, df_quote], axis=1).dropna()

    # 2. Basic Returns
    df['return'] = df['close'].pct_change()
    
    # ---------------------------------------------------------
    # 3. Winning Rate Features (Momentum)
    # ---------------------------------------------------------
    df['is_win'] = (df['return'] > 0).astype(int)
    windows = [10, 20, 50]
    for w in windows:
        df[f'win_rate_{w}'] = df['is_win'].rolling(window=w).mean()

    def get_streak(series):
        streak = pd.Series(0, index=series.index)
        current_streak = 0
        for i in range(len(series)):
            val = series.iloc[i]
            if val > 0:
                current_streak = current_streak + 1 if current_streak >= 0 else 1
            elif val < 0:
                current_streak = current_streak - 1 if current_streak <= 0 else -1
            else:
                current_streak = 0
            streak.iloc[i] = current_streak
        return streak

    df['streak'] = get_streak(df['return'])

    # ---------------------------------------------------------
    # 4. Technical Indicators
    # ---------------------------------------------------------
    def calculate_rsi(data, window=14):
        delta = data.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    df['rsi_14'] = calculate_rsi(df['close'])

    exp12 = df['close'].ewm(span=12, adjust=False).mean()
    exp26 = df['close'].ewm(span=26, adjust=False).mean()
    df['macd'] = exp12 - exp26
    df['signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['hist'] = df['macd'] - df['signal']

    df['sma_20'] = df['close'].rolling(window=20).mean()
    df['std_20'] = df['close'].rolling(window=20).std()
    df['upper_band'] = df['sma_20'] + (df['std_20'] * 2)
    df['lower_band'] = df['sma_20'] - (df['std_20'] * 2)

    df['vol_ma_20'] = df['volume'].rolling(window=20).mean()
    df['vol_ratio'] = df['volume'] / df['vol_ma_20']

    # New Features
    # ATR (Approximate)
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - df['close'].shift())
    low_close = np.abs(df['low'] - df['close'].shift())
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['atr_14'] = tr.rolling(window=14).mean()
    
    df['hl_range'] = (df['high'] - df['low']) / df['close']
    df['dist_sma_20'] = (df['close'] - df['sma_20']) / df['sma_20']
    
    # Quote Features
    df['spread'] = (df['ask_close'] - df['bid_close']) / (df['bid_close'] + 1e-9)
    df['imbalance'] = (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size'] + 1e-9)

    # ---------------------------------------------------------
    # 5. Target Variables (Future Labels)
    # ---------------------------------------------------------
    df['future_return_5'] = df['close'].shift(-5) / df['close'] - 1

    # Define 5 classes
    # 2: up++, 1: up+, 0: unrecognized, -1: down-, -2: down--
    def generate_multiclass_label(ret):
        if pd.isna(ret): return np.nan
        if ret > 0.005: return 2
        if ret > 0.001: return 1
        if ret > -0.001: return 0
        if ret > -0.005: return -1
        return -2

    df['target_class'] = df['future_return_5'].apply(generate_multiclass_label)

    # ---------------------------------------------------------
    # 6. Cleanup
    # ---------------------------------------------------------
    df_clean = df.dropna()
    
    return df_clean

final_df = calculate_features(symbol, interval='1min')
print(final_df.head())


Loading data for btcusdt from ../../../data/cryptofuture/binance/minute/btcusdt...
Loading quote data for btcusdt from ../../../data/cryptofuture/binance/minute/btcusdt...
Found 7 quote files.
                        open     high      low    close  volume  bid_open  \
time                                                                        
2026-02-03 00:49:00  78898.7  78898.8  78859.3  78859.3  28.732   78898.7   
2026-02-03 00:50:00  78859.2  78874.5  78833.4  78850.5  43.821   78859.2   
2026-02-03 00:51:00  78850.5  78850.5  78802.9  78831.1  40.973   78850.5   
2026-02-03 00:52:00  78831.0  78890.1  78800.0  78885.5  48.286   78831.0   
2026-02-03 00:53:00  78885.5  78890.7  78845.0  78886.3  26.203   78885.5   

                     bid_high  bid_low  bid_close  bid_size  ...  \
time                                                         ...   
2026-02-03 00:49:00   78898.8  78859.3    78859.3         0  ...   
2026-02-03 00:50:00   78874.5  78833.4    78850.5         0  ..

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

if final_df is not None:
    # Select features for training
    feature_cols = [
        'win_rate_10', 'win_rate_20', 'win_rate_50', 'streak', 
        'rsi_14', 'macd', 'hist', 'vol_ratio', 'atr_14', 
        'hl_range', 'dist_sma_20', 'spread', 'imbalance'
    ]
    
    X = final_df[feature_cols]
    y = final_df['target_class']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples...")
    
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Feature Importance
    importances = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
    print("\nFeature Importances:")
    print(importances)
else:
    print("No data available.")


Training on 4564 samples, testing on 1142 samples...

Model Evaluation:
Accuracy: 0.2767

Classification Report:
              precision    recall  f1-score   support

        -2.0       0.00      0.00      0.00        64
        -1.0       0.27      0.63      0.38       299
         0.0       0.34      0.23      0.27       302
         1.0       0.26      0.15      0.19       390
         2.0       0.00      0.00      0.00        87

    accuracy                           0.28      1142
   macro avg       0.17      0.20      0.17      1142
weighted avg       0.25      0.28      0.24      1142


Feature Importances:
atr_14         0.179789
hl_range       0.118148
macd           0.114639
hist           0.112868
dist_sma_20    0.096250
rsi_14         0.095539
vol_ratio      0.082794
win_rate_50    0.065845
streak         0.049027
win_rate_20    0.047162
win_rate_10    0.037939
spread         0.000000
imbalance      0.000000
dtype: float64


In [8]:
y_train, y_test

(time
 2026-02-03 00:49:00    0.0
 2026-02-03 00:50:00    0.0
 2026-02-03 00:51:00    1.0
 2026-02-03 00:52:00    1.0
 2026-02-03 00:53:00    1.0
                       ... 
 2026-02-06 04:48:00    0.0
 2026-02-06 04:49:00    0.0
 2026-02-06 04:50:00    0.0
 2026-02-06 04:51:00   -1.0
 2026-02-06 04:52:00    0.0
 Freq: min, Name: target_class, Length: 4564, dtype: float64,
 time
 2026-02-06 04:53:00    0.0
 2026-02-06 04:54:00    0.0
 2026-02-06 04:55:00    0.0
 2026-02-06 04:56:00    1.0
 2026-02-06 04:57:00    1.0
                       ... 
 2026-02-06 23:50:00   -1.0
 2026-02-06 23:51:00    0.0
 2026-02-06 23:52:00    0.0
 2026-02-06 23:53:00    0.0
 2026-02-06 23:54:00    0.0
 Freq: min, Name: target_class, Length: 1142, dtype: float64)