In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import make_classification

import dsds.fs as fs
import dsds.metrics as me

In [2]:
# Fixed seed so the synthetic dataset (and everything derived from it) is the
# same on every Restart & Run All.
SEED = 42

# Synthetic classification data: 10,000 samples and 500 features, of which
# 60 are informative and 440 are redundant.
orig_x, orig_y = make_classification(
    n_samples=10_000,
    n_features=500,
    n_informative=60,
    n_redundant=440,
    random_state=SEED,
)
# dsds relies heavily on Polars and favors Polars dataframes, so any other
# dataframe format must be converted to Polars before it is passed to dsds.
# NOTE(review): newer Polars releases appear to rename `insert_at_idx` to
# `insert_column` -- confirm against the installed Polars version.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# Pandas copy of the same data.
df_pd = df.to_pandas()
target = "target"
# Every column except the target is a feature.
features = [col for col in df.columns if col != target]

In [3]:
# Model-based feature importance.
# The LGBM hyperparameters are suggested automatically by Optuna over a 30-trial tuning run.
# More user control over this tuning process will be added in the future.

f1 = fs.mrmr(df, "target", 50, relevance="lgbm", mrmr_strategy="accum_corr")

INFO:dsds.fs:Running lgbm to determine feature relevance...
  from .autonotebook import tqdm as notebook_tqdm
[I 2023-10-21 00:20:14,110] A new study created in memory with name: no-name-1ecf01dd-02f3-4e09-bc11-6c2de5567e2a
[I 2023-10-21 00:20:20,643] Trial 2 finished with value: 0.24256363556586613 and parameters: {'max_depth': 2, 'num_iterations': 195, 'lambda_l1': 1.3600475714287871e-05, 'lambda_l2': 0.00032717347594624427, 'num_leaves': 139, 'feature_fraction': 0.904125420888508, 'bagging_fraction': 0.8592441425687889, 'bagging_freq': 4, 'min_child_samples': 34}. Best is trial 2 with value: 0.24256363556586613.
[I 2023-10-21 00:20:22,783] Trial 8 finished with value: 0.20988267740835415 and parameters: {'max_depth': 14, 'num_iterations': 51, 'lambda_l1': 5.979213539481126e-06, 'lambda_l2': 9.782081714286334e-05, 'num_leaves': 153, 'feature_fraction': 0.4298206152096491, 'bagging_fraction': 0.5193584586856116, 'bagging_freq': 7, 'min_child_samples': 82}. Best is trial 8 with value: 

Best params: {'max_depth': 9, 'num_iterations': 169, 'lambda_l1': 1.9963240803139397e-08, 'lambda_l2': 2.425838935769425e-06, 'num_leaves': 245, 'feature_fraction': 0.45655145286808796, 'bagging_fraction': 0.9979459248838979, 'bagging_freq': 3, 'min_child_samples': 41}.
Found at trial: 21.
Time took: 41s.


INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 39.19it/s]


In [4]:
# Same top-50 MRMR selection as the lgbm run, but with relevance="f".
f2 = fs.mrmr(df, "target", 50, relevance="f", mrmr_strategy="accum_corr")

INFO:dsds.fs:Running f to determine feature relevance...
INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 40.41it/s]


In [8]:
# Jaccard similarity between the two MRMR results (f1: relevance="lgbm",
# f2: relevance="f"). `me` is dsds.metrics, imported in the notebook's import cell.
# The bare last expression is displayed as the cell output.
me.jaccard_similarity(f1, f2)

0.2345679012345679