In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from time import perf_counter


In [2]:
orig_x, orig_y = make_classification(n_samples = 100_000, n_features = 10, n_informative = 5, n_redundant = 5)
# This is a Polars dataframe. This is dsds package's favored dataframe. dsds relies on Polars heavily.
# You must turn other dataframe formats into Polars for dsds to work.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y)) 
# Turn it into Pandas.
df_pd = df.to_pandas()
target = "target"
features = df.columns
features.remove(target)

In [3]:
df.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2.698327,-1.624455,0.847383,-0.606488,0.777444,-0.744581,-2.733768,-6.183972,-1.439884,-2.33967
0,-1.282648,0.931155,2.391945,1.834565,1.500815,-0.169737,-0.327692,0.310852,0.723299,0.481661
0,-1.092254,0.918623,4.913396,2.451027,1.261594,-2.431023,-3.055167,-1.670721,0.988926,0.389125
0,0.150306,-1.847282,-3.050031,-1.651987,-3.817491,1.337039,-0.848331,2.109579,-0.71082,-3.001302
0,-1.279539,0.99562,3.316524,2.573959,2.800209,0.507946,-0.594723,-0.502504,0.988759,-0.044307


## Comparisons

This notebook compares results and performance between the dsds package, sklearn and some other packages for feature selection and some other transformations common in the data science pipeline.

### Methods Compared:
1. Scaling and Imputation
2. Fscore
3. Mutual Information Score
4. MRMR feature selection strategies
5. Power Transform

You may restart the kernel after each section. But remember to rerun the cells above. If you are concerned about memory usage when running this notebook, go to the end and run the gc cell.

# Scaling and Imputation

In [4]:
import dsds.transform as t

In [None]:
features = df.columns
features.remove("target")

In [None]:
scaled = t.scale(df, cols=features, strategy="standard")
scaled.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# The difference in result is caused by using ddof = 1 for sample variance in dsds
# and using ddof = 0 in sklearn.

# Long and convoluted code just to do some scaling...
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
# scaled2[:5, :] # scaled2 is a numpy matrix
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]
scaled2.head()

In [None]:
%%timeit
scaled = t.scale(df, cols=features, strategy="standard")

In [None]:
%%timeit
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]


In [5]:
import dsds.transform as t
t.impute(df, cols=features, strategy="median").head(3) 

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2.698327,-1.624455,0.847383,-0.606488,0.777444,-0.744581,-2.733768,-6.183972,-1.439884,-2.33967
0,-1.282648,0.931155,2.391945,1.834565,1.500815,-0.169737,-0.327692,0.310852,0.723299,0.481661
0,-1.092254,0.918623,4.913396,2.451027,1.261594,-2.431023,-3.055167,-1.670721,0.988926,0.389125


In [6]:
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)
imputed.head(3)

Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1.0,2.698327,-1.624455,0.847383,-0.606488,0.777444,-0.744581,-2.733768,-6.183972,-1.439884,-2.33967
1,0.0,-1.282648,0.931155,2.391945,1.834565,1.500815,-0.169737,-0.327692,0.310852,0.723299,0.481661
2,0.0,-1.092254,0.918623,4.913396,2.451027,1.261594,-2.431023,-3.055167,-1.670721,0.988926,0.389125


In [7]:
%%timeit
t.impute(df, cols=features, strategy="median")

1.2 ms ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Fscore

In [None]:
import dsds.fs as fs # fs = feature_selection
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression

In [None]:
fs._f_score(df, target=target, num_list = features)

In [None]:
# The more core, the bigger the difference. Data here is not big enough to show the difference
start = perf_counter()
res = fs.f_classif(df, target=target)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

In [None]:
start = perf_counter()
f, pv = f_classif(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

In [None]:
start = perf_counter()
f, pv = f_regression(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start: .2f}s in computing Fscore.")
res.head(10)

# Mutual Information Score

In [None]:
# Vastly faster than sklearn. Finished in 0.7s in this run
fs.mutual_info(df, target=target, conti_cols=features).sort(by="estimated_mi", descending=True).limit(10)

In [None]:
def estimate_mi_sklearn(df:pd.DataFrame, cols:list[str], target:str, k=3, random_state:int=42):
    mi_estimates = mutual_info_classif(df[cols], df[target]
                        , n_neighbors=k, random_state=random_state, discrete_features=False)

    return pl.from_records([cols, mi_estimates], schema=["feature", "estimated_mi"]).sort("estimated_mi", descending=True)

In [None]:
# The reason sklearn's impl is slow is that it did not turn on multithreading for KDtrees.
# Sklearn also did not provide an option to turn it on, despite the fact that sklearn's KDtrees
# does have this functionality. Finished in 4.4s in this run
estimate_mi_sklearn(df_pd, cols=features, target=target).limit(10)

# MRMR Feature selection Strategy

In [None]:
from mrmr import mrmr_classif # This is currently the most starred MRMR Python package on github

In [None]:
# Need to wrap it so that we get apples to apples comparison
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    features = list(df.columns)
    features.remove(target)
    X = df[features]
    y = df[target]
    start = perf_counter()
    output = mrmr_classif(X, y, K = k)
    end = perf_counter()
    print(f"Spent {end - start:.2f}s to compute mrmr.")
    return output

In [None]:
mrmr_package(df_pd, "target", 50)

In [None]:
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=False)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

In [None]:
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=True)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

# Power Transform

In [None]:
# Eager transform.
start = perf_counter()
res_eager = t.power_transform(df, cols=features, strategy="yeo_johnson")
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res_eager.head() 

In [None]:
from sklearn.preprocessing import power_transform

In [None]:
# Sklearn with Pandas

start = perf_counter()
transformed = power_transform(df_pd[features], method = "yeo-johnson", standardize=False)
end = perf_counter()
df_pd[features] = transformed
print(f"Spent {end - start:.2f}s in computing.")
df_pd.head()


# GC

In [None]:
import gc 
gc.collect()