In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [2]:
import json

def save_results_to_json(results, out_path):
    """Save results to JSON file, appending if file exists"""
    
    if os.path.exists(out_path):
        try:
            # Load existing data
            with open(out_path, 'r') as f:
                existing_data = json.load(f)
            
            # Ensure existing_data is a list
            if not isinstance(existing_data, list):
                existing_data = [existing_data]
            
            # Append new results
            existing_data.extend(results)
            
        except json.JSONDecodeError:
            # If existing file is corrupted, start fresh
            print("Warning: Existing JSON file corrupted, starting fresh")
            existing_data = results
        
        # Save combined data
        with open(out_path, 'w') as f:
            json.dump(existing_data, f, indent=2)
    else:
        # Create new file
        with open(out_path, 'w') as f:
            json.dump(results, f, indent=2)

In [3]:
import os

os.getcwd()

'/Users/ryant/Documents/Github/kaggle-challenges/drw-cryptomarketprediction2025/kaggle'

In [4]:
os.getenv('user')

# DRW - Crypto Market Prediction

This notebook documents all the steps done in this project.

Timeline:
- 10/06/25: 0.05031
    - Reorganize notebooks.
    - Test training with GPU - Way faster than CPU.
    - Implement feature elimination using GPU.
    - Tested with Linear Models - will be extremely slow in iteration.
    - Develop feature engineering pipeline
- 14/06/25
    - redevelop feature engineering pipeline - pipe results into downloadable results

In [5]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from scipy.stats import pearsonr
import polars as pl
import numpy as np
from tqdm import tqdm
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()

train_splits = {
    "full" : pl.datetime(2023, 3, 1, 0, 0, 0),
    "last_12m" : pl.datetime(2023, 6, 1, 0, 0, 0),
    "last_9m" : pl.datetime(2023, 9, 1, 0, 0, 0),
    "last_3m" : pl.datetime(2024, 2, 1, 0, 0, 0),
    "last_6m": pl.datetime(2024, 12, 1, 0, 0, 0),
}

PATHS = {
    "TRAIN_PATH" :"/kaggle/input/drw-crypto-market-prediction/train.parquet",
    "TEST_PATH" : "/kaggle/input/drw-crypto-market-prediction/test.parquet",
    "SUBMISSION_PATH" : "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv",
}

# features = []

def load_data(TRAIN_PATH: str, TEST_PATH: str):
    if not os.getenv(key='user'):
        TRAIN_PATH = "." + TRAIN_PATH
        TEST_PATH = "." + TEST_PATH
    train_data = pl.read_parquet(TRAIN_PATH).sort("timestamp", descending = False)
    test_data = pl.read_parquet(TEST_PATH)
    # print(f"Train data shape: {train_data.shape}")
    # print(f"Test data shape: {test_data.shape}")
    return train_data, test_data

train_data, test_data = load_data(
    TRAIN_PATH = PATHS["TRAIN_PATH"],
    TEST_PATH = PATHS["TEST_PATH"],
)
print(train_data.tail(10))

shape: (10, 897)
┌─────────┬─────────┬─────────┬──────────┬───┬──────────┬──────────┬───────────┬──────────────┐
│ bid_qty ┆ ask_qty ┆ buy_qty ┆ sell_qty ┆ … ┆ X889     ┆ X890     ┆ label     ┆ timestamp    │
│ ---     ┆ ---     ┆ ---     ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---       ┆ ---          │
│ f64     ┆ f64     ┆ f64     ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64       ┆ datetime[ns] │
╞═════════╪═════════╪═════════╪══════════╪═══╪══════════╪══════════╪═══════════╪══════════════╡
│ 7.739   ┆ 2.904   ┆ 22.151  ┆ 53.889   ┆ … ┆ 0.137442 ┆ 0.244015 ┆ -0.166322 ┆ 2024-02-29   │
│         ┆         ┆         ┆          ┆   ┆          ┆          ┆           ┆ 23:50:00     │
│ 3.304   ┆ 6.292   ┆ 18.19   ┆ 104.161  ┆ … ┆ 0.137251 ┆ 0.243846 ┆ 0.02131   ┆ 2024-02-29   │
│         ┆         ┆         ┆          ┆   ┆          ┆          ┆           ┆ 23:51:00     │
│ 3.718   ┆ 3.63    ┆ 54.177  ┆ 77.472   ┆ … ┆ 0.137061 ┆ 0.243677 ┆ 0.096966  ┆ 2024-02-29   │
│         ┆         ┆  

# 1 Data

## 1.1 Pre-processing / Feature Engineering

**Pre-Processing**
1. inf/-inf columns: `['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']`
2. columns with NaN values: `[]`
3. 0 std columns : `['X864', 'X867', 'X869', 'X870', 'X871', 'X872']`


**Feature Engineering**
1. `bidask_ratio`
2. `buysell_ratio`
3. `bidask_delta`
4. `buysell_delta`
5. `buysell_size`
6. `bidask_size`

In [6]:
def get_cols_inf(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names that contain any positive or negative infinity.
    """
    cols = []
    for col in df.columns:
        # df[col] is a Series; .is_infinite() → Boolean Series; .any() → Python bool
        try:
            if df[col].is_infinite().any():
                cols.append(col)
        except Exception:
            # if the column isn’t numeric, .is_infinite() might error—just skip it
            continue
    return cols

def get_nan_columns(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names with any NaN/null values.
    """
    cols = []
    for col in df.columns:
        if df.select(pl.col(col).is_null().any()).item():
            cols.append(col)
    return cols

def get_cols_zerostd(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names whose standard deviation is zero
    (or whose std returns None because all values are null).
    Non-numeric columns (e.g. datetime) are skipped.
    """
    cols = []
    for col, dtype in zip(df.columns, df.dtypes):
        # Only attempt std() on numeric dtypes
        if dtype.is_numeric():  
            # df[col] is a Series; .std() returns a Python float or None
            std_val = df[col].std()
            if std_val == 0.0 or std_val is None:
                cols.append(col)
    return cols


def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
    # Feature engineering
    df = df.with_columns([
        # bidask_ratio = bid_qty / ask_qty
        (pl.col("bid_qty") / pl.col("ask_qty")).alias("bidask_ratio"),

        # buysell_ratio = 0 if volume == 0 else buy_qty / sell_qty
        pl.when(pl.col("volume") == 0)
        .then(0)
        .otherwise(pl.col("buy_qty") / pl.col("sell_qty"))
        .alias("buysell_ratio"),

        # bidask_delta = bid_qty - ask_qty
        (pl.col("bid_qty") - pl.col("ask_qty")).alias("bidask_delta"),

        # buysell_delta = buy_qty - sell_qty
        (pl.col("buy_qty") - pl.col("sell_qty")).alias("buysell_delta"),

        # buysell_size = buy_qty + sell_qty
        (pl.col("buy_qty") + pl.col("sell_qty")).alias("buysell_size"),

        # bidask_size = bid_qty + ask_qty
        (pl.col("bid_qty") + pl.col("ask_qty")).alias("bidask_size"),
    ])
    return df
def preprocess_train(train: pl.DataFrame, columns_to_drop: list[str] = []) -> pl.DataFrame:
    """
    Mirror of the original pandas workflow, but using polars.
    1. Identify columns with infinite, NaN, or zero‐std and drop them.
    2. Drop any user‐specified columns (e.g. label or order‐book columns).
    3. (You can add normalized/scaling steps here if needed.)
    """
    df = train.clone()

    df = feature_engineering(df)
    
    #### Preprocessing
    cols_inf = get_cols_inf(df)
    print("Columns with infinite values:", cols_inf)

    cols_nan = get_nan_columns(df)
    print("Columns with NaN values:", cols_nan)

    cols_zerostd = get_cols_zerostd(df)
    print("Columns with zero standard deviation:", cols_zerostd)
    # Drop columns with infinite, NaN, or zero‐std values
    drop_columns = list(set(cols_inf) | set(cols_nan) | set(cols_zerostd) | set(columns_to_drop))
    if drop_columns:
        df = df.drop(drop_columns)
    # df = df.sort("timestamp", descending=False)
    return df, drop_columns

def preprocess_test(test: pl.DataFrame, columns_to_drop: list[str] = []) -> pl.DataFrame:
    df = test.clone()
    df = feature_engineering(df)
    df = df.drop(columns_to_drop)
    print("Columns dropped from test set:", columns_to_drop)
    return df

In [7]:
y = train_data["label"]
X, drop_columns = preprocess_train(
    train_data,
    columns_to_drop=["label", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
)
print(X)

Columns with infinite values: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']
Columns with NaN values: []
Columns with zero standard deviation: ['X864', 'X867', 'X869', 'X870', 'X871', 'X872']
shape: (525_887, 871)
┌─────────┬──────────┬───────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ volume  ┆ X1       ┆ X2        ┆ X3        ┆ … ┆ bidask_del ┆ buysell_de ┆ buysell_s ┆ bidask_si │
│ ---     ┆ ---      ┆ ---       ┆ ---       ┆   ┆ ta         ┆ lta        ┆ ize       ┆ ze        │
│ f64     ┆ f64      ┆ f64       ┆ f64       ┆   ┆ ---        ┆ ---        ┆ ---       ┆ ---       │
│         ┆          ┆           ┆           ┆   ┆ f64        ┆ f64        ┆ f64       ┆ f64       │
╞═════════╪══════════╪═══════════╪═══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 221.389 ┆ 0.121263 ┆ -0.41769  ┆ 0.005399  ┆ … ┆ 6.858  

# 1.2 Time Series Split Functions

1. `split_rollingwindow` - rolling window.
3. `split_overlapwindow` - overlapping.

In [8]:
def split_rollingwindow(X, n_splits=5, train_ratio=0.8):
    """
    Rolling window time series splitter with fixed train/test ratio and number of splits.
    
    Parameters
    ----------
    X : array-like or DataFrame
        Dataset with time-ordering preserved.
    n_splits : int
        Number of rolling splits.
    train_ratio : float
        Proportion of each window used for training (e.g. 0.8 for 80/20 split).
    
    Yields
    ------
    train_idx, test_idx : range, range
        Index ranges for training and testing.
    """
    n_obs = len(X)
    window_size = n_obs // (n_splits + 1)
    train_size = int(train_ratio * window_size)
    test_size = window_size - train_size
    
    for i in range(n_splits):
        start = i * window_size
        train_start = start
        train_end = train_start + train_size
        test_start = train_end
        test_end = test_start + test_size

        if test_end > n_obs:
            break

        yield range(train_start, train_end), range(test_start, test_end)

# splits = rolling_window_split(X=X, n_splits = 5, train_ratio=0.5)
# for train_idx, test_idx in splits:
#     print(train_idx, test_idx)
# #     df_train = X.slice(train_idx.start, len(train_idx))
# #     df_test = X.slice(test_idx.start, len(test_idx))

# for i, (train_idx, test_idx) in enumerate(splits):
#     if i == 1:  # second batch (index 1)
#         df_train = X.slice(train_idx.start, len(train_idx))
#         df_test = X.slice(test_idx.start, len(test_idx))
#         break

In [9]:
def split_overlapwindow(X, train_size, test_size, step):
    """
    Rolling window splitter with overlapping train/test splits.

    Parameters
    ----------
    X : array-like or DataFrame
        Dataset with time-ordering preserved.
    train_size : int
        Number of observations in each training window.
    test_size : int
        Number of observations in each test window.
    step : int
        Forward step size between each split.

    Yields
    ------
    train_idx, test_idx : range, range
        Index ranges for training and testing.
    """
    n_obs = len(X)
    start = 0
    while (start + train_size + test_size) <= n_obs:
        train_start = start
        train_end = train_start + train_size
        test_start = train_end
        test_end = test_start + test_size

        yield range(train_start, train_end), range(test_start, test_end)
        start += step

# train_size = 100000
# test_size = 100000
# step = 10000
# splits = split_overlapwindow(X, train_size, test_size, step)
# for train_idx, test_idx in splits:
#     print(train_idx, test_idx)

# 2 Iterative Feature Selection

## 2.1 Base Model

In [20]:
from xgboost import XGBRegressor

def instantiate_model():
    if not os.getenv(key='USER'):
        print("KAGGLE ENVIRONMENT DETECTED - GPU")
        model = XGBRegressor(
            tree_method = "hist", 
            device = "cuda",
            n_estimators=1000,
            max_depth=100,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1
        )
    else:
        print("LOCAL ENVIRONMENT - CPU")
        model = XGBRegressor(
            tree_method="hist",
            n_estimators=1000,
            max_depth=100,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1
        )
    return model

## 2.2 Iteration

In [25]:
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
import uuid
# import cupy as cp  # You'll need cupy installed
if not os.getenv(key='USER'):
    import cupy as cp  # Ensure you have CuPy installed for GPU support
else:
    cp = None
import shap  # Ensure you have SHAP installed for feature importance

def iterative_featureselection(
    model,
    X: pl.DataFrame,
    y: pl.Series,
    split_fn,
    scorers: dict,
    drop_fraction: float = 0.1,
    min_features: int = 10,
    use_shap: bool = False,
    use_gpu : bool = True
) -> list:
    ID = uuid.uuid4().hex
    results = []
    splits = list(split_fn(X))
    n_splits = len(splits)

    current_X = X.clone()
    feature_cols = current_X.columns
    iteration = 0

    while len(feature_cols) > min_features:
        iteration += 1
        print(f"\n[+] Iteration {iteration} - {len(feature_cols)} features")

        # Extract feature matrix
        X_np = current_X.select(feature_cols).to_numpy()
        y_np = y.to_numpy().flatten()
        if use_gpu:
            X_np = cp.asarray(X_np)
            y_np = cp.asarray(y_np)
        
        print(f"Summary: {len(feature_cols)} features, X: {len(X_np)} y: {len(y_np)}")
        # Fit full model for feature importances
        training_model = clone(model)
        training_model.fit(X_np, y_np)

        # 1. Compute raw importances as a NumPy array
        if use_shap:
            explainer   = shap.TreeExplainer(training_model)
            shap_vals   = explainer.shap_values(X_np)          # shape (n_samples, n_features)
            imp_array   = np.abs(shap_vals).mean(axis=0)        # mean(|SHAP|) per feature
        else:
            imp_array   = np.array(training_model.feature_importances_)
        
        # 2. Build a Polars DataFrame of (feature, importance)
        importances_df = pl.DataFrame({
            "feature": feature_cols,
            "importance": imp_array.tolist()
        })
        
        # Cross-validation
        metrics = {k: [] for k in scorers}

        for i, (train_idx, test_idx) in enumerate(splits, start=1):
            print(f"    [Fold {i}/{n_splits}] ...", end="\r")

            X_train, y_train = X_np[train_idx], y_np[train_idx]
            X_test, y_test = X_np[test_idx], y_np[test_idx]

            fold_model = clone(model)
            fold_model.fit(X_train, y_train)
            y_pred = fold_model.predict(X_test)

            for name, func in scorers.items():
                y_test = y_test.get() if hasattr(y_test, 'get') else y_test
                y_pred = y_pred.get() if hasattr(y_pred, 'get') else y_pred
                metrics[name].append(func(y_test, y_pred))

        # Mean metric values
        mean_metrics = {name: np.mean(vals) for name, vals in metrics.items()}

        results.append({
            "ID": ID,
            "params": {
                "drop_fraction" : drop_fraction,
                "min_features": min_features,
                "use_shap" : use_shap,
                "use_gpu" : use_gpu
            },
            "num_features": len(feature_cols),
            "features": feature_cols.copy(),
            **{f"scores_{name}_mean": val for name, val in mean_metrics.items()}
        })

        print("    → " + " | ".join([f"{k.upper()}: {v:.4f}" for k, v in mean_metrics.items()]))

        # 3. Identify the n_drop least important features
        n_drop    = max(10, int(len(feature_cols) * drop_fraction))
        drop_cols = (
            importances_df
            .sort("importance")        # ascending
            .head(n_drop)              # take the smallest n_drop
            .get_column("feature")     # extract the feature column
            .to_list()                 # into a Python list for filtering
        )
        
        # 4. Filter out dropped features in your Polars DataFrame
        feature_cols = [f for f in feature_cols if f not in drop_cols]
        current_X    = current_X.select(feature_cols)

    print(f"""\n {"["+ "*" * 10 + "]"} Finished: {len(feature_cols)} features remaining (≤ min_features={min_features}) {"["+ "*" * 10 + "]"}""")
    return results
    
def pearson_corr(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]

scorers = {
    "pearson": pearson_corr,
    "mse": mean_squared_error,
    "mae": mean_absolute_error
}

# Test
# results = iterative_featureselection(
#     model=model,
#     X=X,
#     y=y,
#     split_fn=lambda X:split_rollingwindow(X=X, n_splits = 2, train_ratio=0.5),
#     scorers = scorers,
#     drop_fraction=0.9,
#     min_features=100,
#     use_shap = False
# )

In [None]:
# print("Training for Iteration 1 (shap=True)...")
# iter_1 = iterative_featureselection(
#     model=model,
#     X=X,
#     y=y,
#     split_fn=lambda X:split_rollingwindow(X=X, n_splits = 10, train_ratio=0.5),
#     # split_fn=lambda X:split_overlapwindow(X, 50000, 50000, 25000),
#     scorers = scorers,
#     drop_fraction=0.1,
#     min_features=10,
#     use_shap = True
# )
# out_path = "/kaggle/working/iterative_featuresselection.json"
# save_results_to_json(iter_1, out_path)

# print("Training for Iteration 2 (shap = False)...")
# iter_2 = iterative_featureselection(
#     model=model,
#     X=X,
#     y=y,
#     # split_fn=lambda X:split_overlapwindow(X, 50000, 50000, 25000),
#     split_fn=lambda X:split_rollingwindow(X=X, n_splits = 10, train_ratio=0.5),
#     scorers = scorers,
#     drop_fraction=0.1,
#     min_features=10,
#     use_shap = False
# )
# out_path = "/kaggle/working/iterative_featuresselection.json"
# save_results_to_json(iter_2, out_path

In [26]:
def run_feature_selection_experiments(data, experiments, splits, scorers, output_dir="/kaggle/working"):
    """
    Feature Selection Experiments
    
    Parameters:
    - experiments
    - splits : timeframe splits
    - scorers : model scorers
    
    """
    # Process data - split based on time.
    y = data.select([
        pl.col("timestamp").alias("timestamp"),
        pl.col("label").alias("label")
    ])
    X, drop_columns = preprocess_train(
        data,
        columns_to_drop=["label", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
    )
    data_store = {}
    for s_name, s_time in splits.items():
        data_store[s_name] = {
            "X": X.filter(pl.col("timestamp") >= s_time),
            "y": y.filter(pl.col("timestamp") >= s_time).drop("timestamp")
        }
    
    all_results = []

    for config in experiments:
        print(f"\n{'='*50}")
        print(f"Running experiment: ")
        print(f"{'='*50}\n")

        for s_name in splits.keys():
            try:
                print(data_store[s_name]["X"].shape)
                print(data_store[s_name]["y"].shape)
                results = iterative_featureselection(
                    model=model,
                    X=data_store[s_name]["X"],
                    y=data_store[s_name]["y"],
                    split_fn=config['split_fn'],
                    scorers=scorers,
                    drop_fraction=0.9,
                    min_features=200,
                    use_shap=config['use_shap'],
                    use_gpu=config.get('use_gpu', False)
                )
                
                results_df = pl.DataFrame(results).with_columns([
                    pl.lit(config['name']).alias("experiment_name"),
                    pl.lit(config['use_shap']).alias("use_shap"),
                    pl.lit(s_name).alias("split_name"),
                ])

                return results_df
                results_df.to_pandas().to_excel(
                    f"{output_dir}/feature_selection_{config['name']}_{s_name}.xlsx",
                    index=False
                )
                # # Save results for this configuration
                # out_path = f"{output_dir}/feature_selection_{config['desc']}.json"
                # save_results_to_json(results, out_path)
                # print(f"Saved results to {out_path}")
                
                # all_results.append({
                #     "config": config,
                #     "results": results,
                #     "output_path": out_path
                # })
                
            except Exception as e:
                print(f"Error in experiment {config['desc']}: {str(e)}")
                continue
        
    return all_results

model = instantiate_model()

# Define the experiment configurations
experiments = [
    {
        "name": "ex1",
        "desc" : "Test Experiment",
        "model" : clone(model),
        "split_fn" : lambda X: split_rollingwindow(X=X, n_splits=5, train_ratio=0.5),
        "use_shap" : True,
        "use_gpu" : False,
    }
]

data = run_feature_selection_experiments(
    data=train_data,
    experiments=experiments,
    splits=train_splits,
    scorers=scorers
)

LOCAL ENVIRONMENT - CPU
Columns with infinite values: ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717']
Columns with NaN values: []
Columns with zero standard deviation: ['X864', 'X867', 'X869', 'X870', 'X871', 'X872']

Running experiment: 

(525887, 871)
(525887, 1)

[+] Iteration 1 - 871 features
Summary: 871 features, X: 525887 y: 525887


: 

In [None]:
# # Data 4
# start = pd.Timestamp('2023-12-01 00:00:00')
# end = pd.Timestamp('2024-02-29 23:59:00')

# # Data 4
# # start = pd.Timestamp('2024-02-01 00:00:00')
# # end = pd.Timestamp('2024-02-29 23:59:00')

# X_period = X[(X.index >= start) & (X.index <= end)]
# y_period = y[(X.index >= start) & (X.index <= end)]
# X_period.shape

In [None]:
# print("Training for Iteration 3 (shap = False) / Smaller Window...")
# iter_2 = iterative_featureselection(
#     model=model,
#     X=X,
#     y=y,
#     # split_fn=lambda X:split_overlapwindow(X, 50000, 50000, 25000),
#     split_fn=lambda X:split_rollingwindow(X=X, n_splits = 1, train_ratio=0.5),
#     scorers = scorers,
#     drop_fraction=0.1,
#     min_features=10,
#     use_shap = False
# )
# out_path = "/kaggle/working/iterative_featuresselection.json"
# save_results_to_json(iter_2, out_path)

# Current Results

In [None]:
with open("/kaggle/working/iterative_featuresselection.json", "r") as f:
    current_results = json.load(f)

current_results_df = pl.DataFrame(current_results).sort('scores_pearson_mean', descending = True)
current_results_df

In [None]:
current_results_df.to_excel("iterative_featureselection.xlsx", index = False)

In [4]:
import os

os.environ.get("USER")

'ryant'