In [None]:
import os
# Print the full path of every file found below the Kaggle input directory,
# one per line, in the order os.walk visits them.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr
import polars as pl
import numpy as np
from tqdm import tqdm

def get_cols_inf(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names that contain any positive or negative infinity.

    Only floating-point columns are inspected. Other dtypes cannot hold an
    infinity, so they are skipped up front with a dtype check instead of by
    catching whatever ``is_infinite()`` raises on them. An unexpected failure
    on a float column is therefore surfaced rather than silently ignored.

    Args:
        df: Frame to scan.

    Returns:
        Names of the float columns with at least one +inf or -inf value, in
        the frame's column order.
    """
    return [
        name
        for name, dtype in zip(df.columns, df.dtypes)
        # df[name] is a Series; .is_infinite() -> Boolean Series; .any() -> bool
        if dtype.is_float() and df[name].is_infinite().any()
    ]

def get_nan_columns(df: pl.DataFrame) -> list[str]:
    """
    Returns a list of column names with any missing value: a null in any
    column, or a floating-point NaN in a float column.

    polars keeps null and NaN distinct and ``is_null()`` does not flag NaN, so
    a null-only check would let NaN-bearing columns through. The NaN test is
    applied to float dtypes only, since other dtypes cannot hold NaN.

    Args:
        df: Frame to scan.

    Returns:
        Names of the affected columns, in the frame's column order.
    """
    cols = []
    for col, dtype in zip(df.columns, df.dtypes):
        series = df[col]
        has_missing = series.null_count() > 0
        # Only look for NaN when the cheaper null check did not already decide.
        if not has_missing and dtype.is_float():
            has_missing = bool(series.is_nan().any())
        if has_missing:
            cols.append(col)
    return cols

def get_cols_zerostd(df: pl.DataFrame) -> list[str]:
    """
    Collect the numeric columns that carry no spread.

    A column is reported when its standard deviation is exactly 0.0, or when
    ``std()`` yields ``None``. Columns whose dtype is not numeric
    (e.g. datetime) are never evaluated.
    """
    flagged = []
    for name, dtype in zip(df.columns, df.dtypes):
        if not dtype.is_numeric():
            # std() is only attempted on numeric dtypes
            continue
        spread = df[name].std()  # Python float, or None
        if spread is None or spread == 0.0:
            flagged.append(name)
    return flagged


def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
    """
    Return ``df`` extended with six derived columns built from ``bid_qty``,
    ``ask_qty``, ``buy_qty``, ``sell_qty`` and ``volume``:

    - bidask_ratio  = bid_qty / ask_qty
    - buysell_ratio = 0 where volume == 0, otherwise buy_qty / sell_qty
    - bidask_delta  = bid_qty - ask_qty
    - buysell_delta = buy_qty - sell_qty
    - buysell_size  = buy_qty + sell_qty
    - bidask_size   = bid_qty + ask_qty

    Only buysell_ratio has a guard (on ``volume``); bidask_ratio divides
    without checking ``ask_qty``.
    """
    bid, ask = pl.col("bid_qty"), pl.col("ask_qty")
    buy, sell = pl.col("buy_qty"), pl.col("sell_qty")

    # Rows with zero volume get a ratio of 0 instead of the buy/sell division.
    guarded_buysell_ratio = (
        pl.when(pl.col("volume") == 0)
        .then(0)
        .otherwise(buy / sell)
    )

    derived = [
        (bid / ask).alias("bidask_ratio"),
        guarded_buysell_ratio.alias("buysell_ratio"),
        (bid - ask).alias("bidask_delta"),
        (buy - sell).alias("buysell_delta"),
        (buy + sell).alias("buysell_size"),
        (bid + ask).alias("bidask_size"),
    ]
    return df.with_columns(derived)
def preprocess_train(
    train: pl.DataFrame,
    columns_to_drop: list[str] = [],
) -> tuple[pl.DataFrame, list[str]]:
    """
    Mirror of the original pandas workflow, but using polars.
    1. Add the engineered features to a clone of ``train``.
    2. Identify columns with infinite, NaN, or zero-std values.
    3. Drop those columns together with any user-specified columns
       (e.g. label or order-book columns).
    4. (You can add normalized/scaling steps here if needed.)

    Args:
        train: Raw training frame; it is cloned, not modified.
        columns_to_drop: Extra column names to remove. The default list is
            only read, never mutated.

    Returns:
        A ``(frame, drop_columns)`` tuple: the preprocessed frame and the
        alphabetically sorted list of every column that was removed, so the
        same list can be passed to ``preprocess_test``.
    """
    df = feature_engineering(train.clone())

    #### Preprocessing
    cols_inf = get_cols_inf(df)
    print("Columns with infinite values:", cols_inf)

    cols_nan = get_nan_columns(df)
    print("Columns with NaN values:", cols_nan)

    cols_zerostd = get_cols_zerostd(df)
    print("Columns with zero standard deviation:", cols_zerostd)

    # Union of everything to drop. Sorted so the returned list is identical
    # from run to run (plain set iteration order of strings is not).
    drop_columns = sorted(
        set(cols_inf) | set(cols_nan) | set(cols_zerostd) | set(columns_to_drop)
    )
    if drop_columns:
        df = df.drop(drop_columns)
    # df = df.sort("timestamp", descending=False)
    return df, drop_columns

def preprocess_test(test: pl.DataFrame, columns_to_drop: list[str] = []) -> pl.DataFrame:
    """
    Apply the same feature engineering as the training path to a clone of
    ``test``, then remove ``columns_to_drop`` and report them.

    Unlike ``preprocess_train`` no columns are detected here; the caller
    supplies the full list to drop.
    """
    engineered = feature_engineering(test.clone())
    trimmed = engineered.drop(columns_to_drop)
    print("Columns dropped from test set:", columns_to_drop)
    return trimmed

# Data

In [None]:
# Read the competition training table; later cells filter this raw frame again.
train_parquet = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
data = pl.read_parquet(train_parquet)
# Alternative source (relative path):
# data = pl.read_parquet(
#     source = "./data/train.parquet",
# )

# Keep the target aside; `label` is removed from the feature frame below.
y = data.get_column("label")

# The raw quantity columns are listed for removal, but preprocess_train builds
# the engineered features from them before dropping anything.
X, drop_columns = preprocess_train(
    data,
    columns_to_drop=["label", "bid_qty", "ask_qty", "buy_qty", "sell_qty"],
)
X

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Define your date range as Python datetime objects:
from datetime import datetime
# Training window, both endpoints inclusive:
# 2023-03-01 00:00:00 through 2024-02-29 23:59:59.
start = datetime(2023, 3, 1, 0, 0)
end = datetime(2024, 2, 29, 23, 59, 59)

# Feature columns used for training and, later, for prediction.
features = ['X35', 'X96', 'X113', 'X126', 'X261', 'X539', 'X666', 'X690', 'X696']

# One predicate on the `timestamp` column, shared by both filters below so the
# feature rows and the label rows are picked by exactly the same condition.
in_period = (pl.col("timestamp") >= pl.lit(start)) & (pl.col("timestamp") <= pl.lit(end))

# Features inside the window; `timestamp` is carried along here and removed
# again just before the conversion to NumPy.
X_period = X.filter(in_period).select(["timestamp", *features])

# Labels for the same window, read from the raw `data` frame because `label`
# was dropped from X. The result is a polars Series.
y_period = data.filter(in_period).get_column("label")

# NumPy arrays for scikit-learn.
X_np = X_period.drop(["timestamp"]).to_numpy()
y_np = y_period.to_numpy()

# Model Training

## LightGBM

In [None]:
# NOTE: everything in this cell is commented out, so no LightGBM model is trained here.
# from lightgbm import LGBMRegressor

# def fit_lightgbm_regression(
#     X: np.ndarray,
#     y: np.ndarray,
#     n_estimators: int = 100,
#     learning_rate: float = 0.1,
#     num_leaves: int = 31,
#     **kwargs
# ) -> LGBMRegressor:
#     """
#     X and y must be NumPy arrays. Since polars DataFrames are not
#     directly accepted by scikit‐learn/lightgbm, we pass .to_numpy().
#     """
#     model = LGBMRegressor(
#         n_estimators=n_estimators,
#         learning_rate=learning_rate,
#         num_leaves=num_leaves,
#         **kwargs
#     )
#     model.fit(X, y)
#     return model

# model = fit_lightgbm_regression(
#     X_np,
#     y_np,
#     # 1. Learning rate: much lower than 0.3 to allow gradual fitting.
#     learning_rate=0.05,

#     # 2. Number of trees: increase so that η·T is roughly O(50–100) in practice.
#     #    Here, 1000 trees × 0.05 = 50 “effective steps” of gradient boosting.
#     n_estimators=1000,  

#     # 3. num_leaves: controls maximum number of terminal nodes per tree.
#     #    A rule of thumb is ~2^(max_depth). For dataset with moderate complexity,
#     #    num_leaves=64 (≈2^6) is common; if features are very noisy, reduce it.
#     num_leaves=64,      

#     # 4. max_depth: optional cap on tree depth—keeps each tree from growing too deep.
#     #    If you set max_depth=10, then num_leaves is effectively ≤ 2^10, but
#     #    most practitioners leave max_depth unset when they tune num_leaves directly.
#     max_depth=10,        

#     # 5. min_data_in_leaf (min_child_samples): ensures a leaf has enough observations.
#     #    E.g., if you have 100 k rows total, min_data_in_leaf=20 or 50 prevents overfitting.
#     min_data_in_leaf=20, 

#     # 6. subsample (a.k.a. bagging_fraction): to reduce variance, randomly sample rows.
#     #    0.8 means each tree sees 80 % of data. Coupled with subsample_freq=1 (every tree).
#     subsample=0.8,       
#     subsample_freq=1,    

#     # 7. colsample_bytree (a.k.a. feature_fraction): randomly sample 80 % of features per tree.
#     colsample_bytree=0.8,

#     # 8. Regularization: L1 or L2 to further guard against overfitting.
#     reg_alpha=0.1,   # L1 regularization
#     reg_lambda=1.0,  # L2 regularization

#     # 9. Other sensible defaults:
#     n_jobs=-1,
#     random_state=42,
#     verbosity=1,
# )

# model = fit_lightgbm_regression(
#     X_np,
#     y_np,
#     learning_rate=0.05,
#     n_estimators=1000,  
#     num_leaves=64,      
#     max_depth=10,        
#     min_data_in_leaf=20, 
#     subsample=0.8,       
#     subsample_freq=1,
#     colsample_bytree=0.8,
#     reg_alpha=0.1,   # L1 regularization
#     reg_lambda=1.0,  # L2 regularization

#     n_jobs=-1,
#     random_state=42,
#     verbosity=1,
    # device='gpu',             # enable CUDA
# )

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Hyperparameters are gathered in one mapping so the configuration can be read
# in one place before the estimator is built.
rf_params = {
    "n_estimators": 10000,      # Number of trees
    "max_depth": 10,            # Maximum depth of each tree
    "min_samples_split": 5,     # Minimum samples required to split an internal node
    "min_samples_leaf": 2,      # Minimum samples required to be at a leaf node
    "max_features": 'sqrt',     # Use square root of features for splitting
    "random_state": 42,         # For reproducibility
    "n_jobs": -1,               # Use all available cores
}
model = RandomForestRegressor(**rf_params)

# Fit on the NumPy arrays built from the selected training period.
model.fit(X_np, y_np)

# Predict

In [None]:
# Read the test table and number its rows from 1 in a new leading `ID` column.
# Alternative source (relative path):
# test = pl.read_parquet(
#     source = "./data/test.parquet",
# )
test = (
    pl.read_parquet("/kaggle/input/drw-crypto-market-prediction/test.parquet")
    .with_row_index("ID", offset=1)
)

# Same preprocessing as training (using the column list returned by
# preprocess_train), then keep only the model's feature columns.
X_test = (
    preprocess_test(test, columns_to_drop=drop_columns)
    .drop(["ID"])
    .select(features)
)
X_test

In [None]:
# Score the test features; the polars frame is converted to NumPy for predict().
test_matrix = X_test.to_numpy()
y_pred = model.predict(test_matrix)

# Pair each prediction with the row ID that was added when the test set was loaded.
submission = pl.DataFrame(
    {
        "ID": test["ID"],
        "prediction": y_pred,
    }
)
submission

# Submission

Columns of `sample_submission.csv`:
- ID
- prediction

In [None]:
# Save the (ID, prediction) frame as CSV in the Kaggle working directory.
submission_path = "/kaggle/working/submission.csv"
submission.write_csv(submission_path)