In [1]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr as pr
from sklearn.base import clone
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

warnings.filterwarnings("ignore")

In [2]:
class CFG:
    train_path = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    test_path = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    sample_sub_path = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
    target = "label"

In [3]:
train = pd.read_parquet(CFG.train_path).reset_index(drop=True)
test = pd.read_parquet(CFG.test_path).reset_index(drop=True)

In [4]:
train.isna().sum()

bid_qty     0
ask_qty     0
buy_qty     0
sell_qty    0
volume      0
           ..
X777        0
X778        0
X779        0
X780        0
label       0
Length: 786, dtype: int64

In [5]:
def data_processing(data):
    X = data.drop(CFG.target, axis=1)
    y = data[CFG.target]

    return X,y

In [6]:
train_X, train_y = data_processing(train)
test_X, test_y = data_processing(test)

In [7]:
# Стандартизируем данные
scaler = StandardScaler()
# Применяем стандартное масштабирование к обучающим данным
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [8]:
# Инициализируем PCA
pca = PCA(n_components=30)

# Обучаем PCA на тренировочных данных
train_X = pca.fit_transform(train_X)


In [9]:
test_X = pca.transform(test_X)

In [10]:
# features: {feature_name: feature}
features = {}

for feature_idx in range(train_X.shape[1]):
    sub_features = {
        # f'{feature_idx} + _ln' : np.log(train_X[:,feature_idx]),
        f'{feature_idx} + _x^2' : np.power(train_X[:,feature_idx],2),
        f'{feature_idx} + _x^3' : np.power(train_X[:,feature_idx],3)
    }
    for name, feature in sub_features.items():
        features[name] = feature
    
for feature1_idx in range(train_X.shape[1] - 1):
    for feature2_idx in range(feature1_idx + 1, train_X.shape[1]):
        name = f'{feature1_idx} * {feature2_idx}'
        feature = train_X[:,feature1_idx] * train_X[:,feature2_idx]
        features[name] = feature

matrix_for_feature_selection = np.zeros((len(train_X),len(features)))

for y, feature in enumerate(features.values()):
    matrix_for_feature_selection[:,y] = feature

In [11]:
%load_ext cuml.accel

[2025-07-20 10:59:20.273] [CUML] [info] cuML: Installed accelerator for sklearn.


2025-07-20 10:59:27.015003: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753009167.214333     135 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753009167.273036     135 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[2025-07-20 10:59:36.988] [CUML] [info] cuML: Installed accelerator for umap.
[2025-07-20 10:59:37.028] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-07-20 10:59:37.028] [CUML] [info] cuML: Successfully initialized accelerator.


In [12]:
%pip install --upgrade cuml 

Collecting cuml
  Using cached cuml-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: cuml
  Building wheel for cuml (setup.py) ... [?25l  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
error
[31m  ERROR: Failed building wheel for cuml[0m[31m
[0m[?25h  Running setup.py clean for cuml
Failed to build cuml
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (cuml)[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.feature_selection import mutual_info_classif

# Преобразование данных в формат GPU (cuDF или CuPy)
import cupy as cp
X_gpu = cp.asarray(train_X)  # Ваши данные
y_gpu = cp.asarray(train_y)   # Целевые метки

# Вычисление mutual information (автоматически на GPU)
mi_scores = mutual_info_classif(X_gpu, y_gpu, n_neighbors=3)
print("Mutual Information Scores:", mi_scores)

ModuleNotFoundError: No module named 'cuml.feature_selection'