In [53]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import VarianceThreshold

# Size of the synthetic dataset and of each feature group.
# Columns not covered by the three groups below are pure noise.
n_samples = 1_000
n_features = 20
n_informative = 5
n_redundant = 5
n_duplicates = 5

# shuffle=False keeps the columns in generation order (informative, then
# redundant, then repeated, then noise), so every column can later be
# labelled by its position alone.
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=n_duplicates,
    class_sep=2.0,
    shuffle=False,
    random_state=42,
)

# Every column starts with a generic label; columns outside the three groups
# below keep it.
feature_names = [f"feature_{col}" for col in range(X.shape[1])]

# Consecutive column ranges for each feature group. This positional mapping
# relies on the dataset having been generated with shuffle=False.
informative_indices = range(n_informative)
redundant_indices = range(informative_indices.stop, informative_indices.stop + n_redundant)
duplicated_indices = range(redundant_indices.stop, redundant_indices.stop + n_duplicates)

# Group-specific labels, each numbered from 0 within its own group.
informative_names = [f"informative_{k}" for k in informative_indices]
redundant_names = [f"redundant_{k}" for k in range(n_redundant)]
duplicated_names = [f"duplicated_{k}" for k in range(n_duplicates)]

# Overwrite the generic labels with the group-specific ones.
for group_indices, group_names in (
    (informative_indices, informative_names),
    (redundant_indices, redundant_names),
    (duplicated_indices, duplicated_names),
):
    for col, label in zip(group_indices, group_names):
        feature_names[col] = label

In [54]:
# Recursive feature elimination wrapped around a logistic regression:
# keep refitting on fewer features until only 5 remain (verbose=1 logs
# each refit).
model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=5, verbose=1).fit(X, y)

# rfe.support_ is indexed by column; report the names of the kept columns.
selected_features = [
    name for idx, name in enumerate(feature_names) if rfe.support_[idx]
]
print(f"Selected features: {selected_features}")


Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Selected features: ['informative_0', 'informative_1', 'redundant_3', 'duplicated_1', 'feature_19']


In [55]:
from sklearn.datasets import make_classification
from sklearn.linear_model import Lasso

# L1-regularised linear fit; a feature counts as "selected" when its
# coefficient is not exactly zero.
# NOTE(review): Lasso is a regression model and y holds class labels here,
# so the labels are fitted as plain numbers - confirm this is intended.
model = Lasso(alpha=0.1).fit(X, y)
coef = model.coef_

selected_features = [
    name for idx, name in enumerate(feature_names) if coef[idx] != 0
]
print(f"Selected features: {selected_features}")


Selected features: ['redundant_2', 'redundant_3']


In [56]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import VarianceThreshold




# Unsupervised filter: drops columns whose variance does not exceed the
# threshold. The target y is not used, so this step cannot by itself tell
# informative columns from noisy ones.
selector = VarianceThreshold(threshold=0.1)
X_transformed = selector.fit_transform(X)

# Rank only the columns that passed the threshold, so the reported names
# always agree with the columns present in X_transformed. Ranking over all
# of `variances_` could list a column the selector had just removed.
kept_indices = selector.get_support(indices=True)
by_variance_desc = selector.variances_[kept_indices].argsort()[::-1]

# Up to 5 highest-variance surviving columns, as indices into the original X.
selected_indices = kept_indices[by_variance_desc][:5]

selected_features = [feature_names[i] for i in selected_indices]
print(f"Selected features: {selected_features}")


Selected features: ['redundant_2', 'duplicated_3', 'redundant_1', 'redundant_4', 'duplicated_0']


Żeby naprawdę znaleźć większość informative features, potrzebujemy o wiele więcej niż 1000 linii (próbek).