## OHLCV Data

In [16]:
import json
import requests
import pandas as pd
from binance.client import Client

# Load API keys from config.json
with open("config.json", "r") as file:
    config = json.load(file)

API_KEY = config["binance"]["api_key"]
API_SECRET = config["binance"]["api_secret"]

# Initialize Binance client
client = Client(API_KEY, API_SECRET)

# Fetch OHLCV data
def fetch_ohlcv(symbol="BTCUSDT", interval="5m", limit=500):
    data = client.get_klines(symbol=symbol, interval=interval, limit=limit)
    df = pd.DataFrame(data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume', 
                                     'close_time', 'quote_volume', 'trades', 
                                     'taker_base_vol', 'taker_quote_vol', 'ignore'])
    
    # Convert timestamp to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].astype(float)
    
    return df[['timestamp', 'open', 'high', 'low', 'close', 'volume']]

ohlcv_df = fetch_ohlcv()


## Orderbook Data (Liquidity & Spread)

In [17]:
def fetch_order_book(symbol="BTCUSDT", depth=10):
    order_book = client.get_order_book(symbol=symbol, limit=depth)
    
    bids = pd.DataFrame(order_book['bids'], columns=['bid_price', 'bid_size']).astype(float)
    asks = pd.DataFrame(order_book['asks'], columns=['ask_price', 'ask_size']).astype(float)
    
    # Compute bid-ask spread
    spread = asks['ask_price'].iloc[0] - bids['bid_price'].iloc[0]

    return {'bids': bids, 'asks': asks, 'spread': spread}

order_book_data = fetch_order_book()


In [27]:
order_book_data.__sizeof__

<function dict.__sizeof__>

## Funding Rates

In [24]:
def fetch_historical_funding_rates(symbol="BTCUSDT", limit=100):
    url = f"https://fapi.binance.com/fapi/v1/fundingRate?symbol={symbol}&limit={limit}"
    response = requests.get(url).json()
    
    if isinstance(response, list):
        df = pd.DataFrame(response)
        df['fundingTime'] = pd.to_datetime(df['fundingTime'], unit='ms')
        df['fundingRate'] = df['fundingRate'].astype(float)
        return df[['fundingTime', 'fundingRate']]
    else:
        print(f"Error fetching funding rates: {response}")
        return None

funding_df = fetch_historical_funding_rates()
print(funding_df.head())


              fundingTime   fundingRate
0 2024-12-29 08:00:00.000  1.000000e-04
1 2024-12-29 16:00:00.000  1.000000e-04
2 2024-12-30 00:00:00.000 -4.200000e-07
3 2024-12-30 08:00:00.001  8.791000e-05
4 2024-12-30 16:00:00.000  1.000000e-04


In [28]:
# Merge funding rate with OHLCV data
ohlcv_df = fetch_ohlcv()

# Convert timestamps for proper merging
ohlcv_df['timestamp'] = pd.to_datetime(ohlcv_df['timestamp'])
funding_df['fundingTime'] = pd.to_datetime(funding_df['fundingTime'])

# Merge based on nearest timestamps
ohlcv_df = pd.merge_asof(ohlcv_df.sort_values('timestamp'), 
                          funding_df.sort_values('fundingTime'),
                          left_on='timestamp', right_on='fundingTime')

print(ohlcv_df.head())


            timestamp       open       high        low      close     volume  \
0 2025-01-29 20:35:00  104488.64  104512.00  104066.66  104127.91  209.06861   
1 2025-01-29 20:40:00  104127.92  104296.00  103945.75  104273.26  174.54813   
2 2025-01-29 20:45:00  104273.26  104384.00  104090.91  104278.96  128.04634   
3 2025-01-29 20:50:00  104278.95  104450.52  104016.84  104303.87  122.39772   
4 2025-01-29 20:55:00  104303.86  104484.70  104187.53  104193.93  156.66794   

          fundingTime  fundingRate  
0 2025-01-29 16:00:00     0.000099  
1 2025-01-29 16:00:00     0.000099  
2 2025-01-29 16:00:00     0.000099  
3 2025-01-29 16:00:00     0.000099  
4 2025-01-29 16:00:00     0.000099  


In [25]:
funding_df

Unnamed: 0,fundingTime,fundingRate
0,2024-12-29 08:00:00.000,1.000000e-04
1,2024-12-29 16:00:00.000,1.000000e-04
2,2024-12-30 00:00:00.000,-4.200000e-07
3,2024-12-30 08:00:00.001,8.791000e-05
4,2024-12-30 16:00:00.000,1.000000e-04
...,...,...
95,2025-01-30 00:00:00.000,7.590000e-05
96,2025-01-30 08:00:00.000,1.000000e-04
97,2025-01-30 16:00:00.000,9.499000e-05
98,2025-01-31 00:00:00.000,1.225000e-05


## Feature Stability Test

In [21]:
import numpy as np

def fractional_diff(series, d=0.4, threshold=1e-5):
    weights = [1]
    for k in range(1, len(series)):
        w = -weights[-1] * ((d - k + 1) / k)
        if abs(w) < threshold:
            break
        weights.append(w)
    weights = np.array(weights[::-1]).reshape(-1, 1)
    
    return series.rolling(len(weights)).apply(lambda x: np.dot(weights.T, x), raw=True)

ohlcv_df['frac_diff_returns'] = fractional_diff(ohlcv_df['close'].pct_change())

# Rolling correlation to check stability
ohlcv_df['frac_diff_corr'] = ohlcv_df['frac_diff_returns'].rolling(100).corr(ohlcv_df['close'].pct_change())


## OrderFlow Imbalance (OFI) Stability Test

In [22]:
def order_flow_imbalance(bids, asks):
    buy_pressure = asks['ask_size'].sum()
    sell_pressure = bids['bid_size'].sum()
    
    return (buy_pressure - sell_pressure) / (buy_pressure + sell_pressure)

ohlcv_df['ofi'] = order_flow_imbalance(order_book_data['bids'], order_book_data['asks'])

# Check variance over time
ohlcv_df['ofi_var'] = ohlcv_df['ofi'].rolling(100).var()


## Predictive Power Test

Now, we’ll use XGBoost to see which features are important for position sizing.

In [45]:
import json
import time
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from binance.client import Client
from scipy.stats import skew, kurtosis
import pywt  # For wavelet transforms

##############################################
# 1. Load API Keys & Initialize Client
##############################################

with open("config.json", "r") as f:
    config = json.load(f)

API_KEY = config["binance"]["api_key"]
API_SECRET = config["binance"]["api_secret"]
client = Client(API_KEY, API_SECRET)

##############################################
# 2. Advanced/Exotic Feature Functions
##############################################

def fractional_diff(series, d=0.4, max_lags=50):
    """Fractionally differentiate 'series' using up to `max_lags` terms."""
    w = [1.0]
    for k in range(1, max_lags):
        w_ = -w[-1] * ((d - (k - 1)) / k)
        w.append(w_)
    w = np.array(w[::-1]).reshape(-1, 1)

    def fracdiff_apply(x):
        return np.dot(w.T, x)[0]

    return series.rolling(window=max_lags).apply(fracdiff_apply, raw=True)

def rolling_shannon_entropy(series, window=5, bins=5):
    """Rolling Shannon Entropy of 'series' distribution in each window."""
    def shannon_window(x):
        hist, _ = np.histogram(x, bins=bins, density=True)
        hist = hist[hist > 0]
        return -np.sum(hist * np.log2(hist))
    
    return series.rolling(window).apply(shannon_window, raw=False)

def rolling_skewness(series, window=30):
    """Rolling skewness (3rd moment)."""
    return series.rolling(window).apply(lambda x: skew(x), raw=False)

def rolling_kurtosis(series, window=30):
    """Rolling kurtosis (4th moment)."""
    return series.rolling(window).apply(lambda x: kurtosis(x), raw=False)

def rolling_hurst_exponent(series, window=50):
    """
    Rolling Hurst exponent with naive R/S approach.
    H > 0.5 => persistent/trending, < 0.5 => mean-reverting, ~0.5 => random.
    """
    def hurst_rs(x):
        x = pd.Series(x)
        mean_x = x.mean()
        adj = x - mean_x
        cum = adj.cumsum()
        R = cum.max() - cum.min()
        S = x.std() if x.std() != 0 else 1e-9
        N = len(x)
        return np.log(R/S)/np.log(N) if R > 0 else 0.0

    return series.rolling(window).apply(hurst_rs, raw=False)

##############################################
# 2.1. Even More Advanced (Chaos / Physics)
##############################################

def rolling_perm_entropy(series, window=10, order=3):
    """
    Rolling **Permutation Entropy**. 
    - 'order' is the embedding dimension used for ordinal patterns.
    - If series is random, PE ~ ln(order!), 
      if series is perfectly predictable, PE is smaller.
    - We'll compute it for each rolling window of length 'window'.
    """
    def perm_entropy_window(x):
        # x is the local window
        # generate ordinal patterns of length = 'order'
        # for simplicity, we skip overlapping patterns beyond what's needed
        if len(x) < order:
            return np.nan
        
        # Create permutations
        # We'll just do the simplest: consecutive ordinal patterns
        # e.g. x[i:i+order], and see how often each pattern occurs
        patterns_count = {}
        for i in range(len(x) - order + 1):
            # ordinal pattern
            sub = x[i:i+order]
            # get indices that would sort 'sub'
            pattern = tuple(np.argsort(sub))
            patterns_count[pattern] = patterns_count.get(pattern, 0) + 1
        
        total = sum(patterns_count.values())
        # Shannon entropy of the distribution of patterns
        pe = 0.0
        for c in patterns_count.values():
            p = c / total
            pe -= p * np.log2(p)
        return pe

    return series.rolling(window).apply(perm_entropy_window, raw=False)

def rolling_wavelet_energy(series, wavelet='haar', level=2, window=50):
    """
    Rolling wavelet energy. 
    1) For each rolling window, do a discrete wavelet transform up to 'level'
    2) Sum of squares of detail coeffs => 'wavelet energy'
    More advanced versions might sum across multiple decomposition levels or do continuous wavelet transforms.
    """
    def wavelet_window(x):
        # Basic DWT
        # pywt.wavedec returns a list [cA_{level}, cD_{level}, ..., cD_{1}]
        coeffs = pywt.wavedec(x, wavelet, level=level)
        # We can measure energy as sum of squares of detail coefficients
        # e.g. cD_1, cD_2, etc.
        energy = 0.0
        # skip the approximation at index 0, use detail coeffs from index 1
        for c in coeffs[1:]:
            energy += np.sum(np.array(c)**2)
        return energy
    
    return series.rolling(window).apply(wavelet_window, raw=True)

def rolling_lyapunov_exponent(series, window=50, tau=1, dim=2):
    """
    Approximate Largest Lyapunov Exponent (LLE) in a rolling window 
    using a simple Rosenstein method.
    This is a *very rough* approach.
    - 'tau' is time delay
    - 'dim' is embedding dimension
    Realistically, you'd want a more robust library or method. 
    """
    def lyapunov_window(x):
        x = np.array(x)
        # embed the time series in dimension=dim, time delay=tau
        # We'll do minimal checks
        N = len(x) - (dim-1)*tau
        if N < 2:
            return np.nan

        # Build embedded vectors
        embedded = []
        for i in range(N):
            # each vector is [x[i], x[i+tau], ..., x[i+(dim-1)*tau]]
            v = x[i : i + dim*tau : tau]
            embedded.append(v)
        embedded = np.array(embedded)

        # For each point, find nearest neighbor
        # This is O(N^2) but okay for small windows
        dists = []
        for i in range(N):
            dmin = 1e20
            for j in range(N):
                if j == i: 
                    continue
                dist_ij = np.linalg.norm(embedded[i] - embedded[j])
                if dist_ij < dmin:
                    dmin = dist_ij
            dists.append(dmin)
        
        # For a real method, we'd track how distances evolve over time steps,
        # but let's do a naive measure: average log(dmin)
        avg_log_dist = np.mean(np.log(np.array(dists) + 1e-9))
        # We'll treat that as a proxy for chaos => higher => more chaotic
        return avg_log_dist

    return series.rolling(window).apply(lyapunov_window, raw=False)

def rolling_tsallis_entropy(series, window=30, q=1.5):
    """
    Tsallis entropy: a generalized entropy measure from non-extensive thermodynamics.
    For q=1, it becomes Shannon. For q!=1, there's a 'q' parameter controlling concavity.
    We'll compute it in a naive histogram approach for each rolling window.
    """
    def tsallis_window(x):
        hist, _ = np.histogram(x, bins=20, density=True)
        hist = hist[hist>0]
        # Tsallis S_q = (1 - sum(p_i^q)) / (q - 1)
        p_q = hist**q
        return (1.0 - np.sum(p_q)) / (q - 1.0)
    
    return series.rolling(window).apply(tsallis_window, raw=False)

##############################################
# 3. Fetching OHLCV with get_historical_klines
##############################################

def fetch_ohlcv(symbol="BTCUSDT", interval="5m", start_str="30 days ago UTC"):
    """
    python-binance's get_historical_klines automatically loops 
    for more than 500 bars, up to the current time.
    """
    klines = client.get_historical_klines(
        symbol=symbol,
        interval=interval,
        start_str=start_str,
        limit=1500
    )
    if not klines:
        return pd.DataFrame()

    df = pd.DataFrame(klines, columns=[
        "timestamp", "open", "high", "low", "close", "volume",
        "close_time", "quote_volume", "trades",
        "taker_base_vol", "taker_quote_vol", "ignore"
    ])
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
    for c in ["open","high","low","close","volume"]:
        df[c] = df[c].astype(float)
    df = df[["timestamp","open","high","low","close","volume"]].copy()
    df.sort_values("timestamp", inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

##############################################
# 4. Data Pipeline with "Math Olympiad" Features
##############################################

def run_data_pipeline(symbol="BTCUSDT", interval="5m", days=30):
    """
    Fetch data, compute multiple advanced features:
     1) price_returns, fractional_diff
     2) rolling Shannon entropy
     3) skew, kurtosis, hurst
     4) permutation entropy
     5) wavelet energy
     6) approximate Lyapunov exponent
     7) Tsallis entropy
    """
    print("=== Building Dataset ===")
    start_str = f"{days} days ago UTC"
    ohlcv_df = fetch_ohlcv(symbol, interval, start_str=start_str)
    print("OHLCV rows:", len(ohlcv_df))

    # Basic returns
    ohlcv_df["price_returns"] = ohlcv_df["close"].pct_change()

    # Fractional difference
    ohlcv_df["frac_diff_returns"] = fractional_diff(ohlcv_df["price_returns"], d=0.4, max_lags=50)

    # Rolling Shannon entropy
    ohlcv_df["entropy"] = rolling_shannon_entropy(ohlcv_df["price_returns"], window=5, bins=5)

    # Rolling Skew & Kurt
    ohlcv_df["skew_30"] = rolling_skewness(ohlcv_df["price_returns"], window=30)
    ohlcv_df["kurt_30"] = rolling_kurtosis(ohlcv_df["price_returns"], window=30)

    # Rolling Hurst exponent
    ohlcv_df["hurst_50"] = rolling_hurst_exponent(ohlcv_df["price_returns"], window=50)

    # Permutation entropy
    ohlcv_df["perm_entropy_10"] = rolling_perm_entropy(ohlcv_df["price_returns"], window=10, order=3)

    # Wavelet energy (haar wavelet, level=2)
    ohlcv_df["wavelet_energy"] = rolling_wavelet_energy(ohlcv_df["price_returns"], wavelet='haar', level=2, window=50)

    # Largest Lyapunov exponent (approx)
    ohlcv_df["lyapunov_50"] = rolling_lyapunov_exponent(ohlcv_df["price_returns"], window=50, tau=1, dim=2)

    # Tsallis entropy
    ohlcv_df["tsallis_30"] = rolling_tsallis_entropy(ohlcv_df["price_returns"], window=30, q=1.5)

    print("Final rows (post-feature):", len(ohlcv_df))
    return ohlcv_df

##############################################
# 5. XGBoost Regressor (Focus on Model, No Backtest)
##############################################

def train_xgboost_regressor(df):
    """
    Train an XGBoost regressor to predict next-candle return
    from the advanced feature set. We won't do a backtest here,
    just measure MSE / R^2 for the model.
    """
    print("\n=== Training XGBoost for Next-Candle Return Prediction ===")

    # Build a big feature list
    features = [
        "frac_diff_returns",
        "entropy",
        "skew_30",
        "kurt_30",
        "hurst_50",
        "perm_entropy_10",
        "wavelet_energy",
        "lyapunov_50",
        "tsallis_30",
    ]

    # Define target = next candle's return
    df["next_return"] = df["price_returns"].shift(-1)

    # Drop rows with NaNs in features or target
    print("Before dropna, rows:", len(df))
    df.dropna(subset=features + ["next_return"], inplace=True)
    print("After dropna, rows:", len(df))

    if len(df) < 10:
        raise ValueError("Not enough data left after dropping NaNs!")

    X = df[features]
    y = df["next_return"]

    # For a robust approach, use TimeSeriesSplit or walk-forward.
    # But here we do a simple train_test_split for demonstration.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = xgb.XGBRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Test MSE: {mse:.8f}")
    print(f"Test R^2: {r2:.6f}")

    # Feature importances
    feat_imp = model.get_booster().get_score(importance_type="weight")
    print("Feature Importance:", feat_imp)

    return model

#############################################################
# 6. Example Notebook Usage (Focus on Model, Not Backtest)
#############################################################

symbol = "BTCUSDT"
interval = "5m"
days = 50  # or 30, etc. Increase for more data

# 1) Build dataset with advanced "math olympiad" features
df_ohlcv = run_data_pipeline(symbol, interval, days=days)

# 2) Train XGBoost Regressor
model = train_xgboost_regressor(df_ohlcv)


=== Building Dataset ===
OHLCV rows: 1000
Final rows (post-feature): 1000

=== Training XGBoost for Next-Candle Return Prediction ===
Before dropna, rows: 1000
After dropna, rows: 949
Test MSE: 0.00000158
Test R^2: -0.114555
Feature Importance: {'frac_diff_returns': 208.0, 'entropy': 119.0, 'skew_30': 199.0, 'kurt_30': 160.0, 'hurst_50': 155.0, 'perm_entropy_10': 50.0, 'wavelet_energy': 126.0, 'lyapunov_50': 115.0, 'tsallis_30': 103.0}
