## Constant features

In [1]:
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropConstantFeatures

In [2]:
# Toy dataset with redundant and constant features

X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=10,
)

X = pd.DataFrame(X)
y = pd.Series(y)

# Add constant features
X[[0, 5, 9]] = 1

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,-0.376539,-0.62018,-0.157567,-1.120805,1,-1.574578,1.678046,1.08018,1
1,1,0.762409,-0.78421,-0.096479,-0.408758,1,0.210942,-0.850449,-0.461301,1
2,1,2.227934,0.547727,-0.341481,-0.817577,1,-2.663678,2.440042,1.698919,1
3,1,0.061129,-0.995868,-0.214351,-0.558957,1,-2.149167,2.294192,-1.383965,1
4,1,0.046349,0.834756,-0.104845,-0.455528,1,-0.911018,0.898098,1.068259,1


In [3]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0,
)

X_train.shape, X_test.shape

((700, 10), (300, 10))

## VarianceThreshold from Scikit-learn

Only works with numerical variables. Categorical variables need to be encoded first.

In [4]:
# To remove constant features
sel = VarianceThreshold(threshold=0)

# fit finds the features with zero variance
sel.fit(X_train)  

In [5]:
# get_support is a boolean vector flags 
# the features to keep

# Number of selected features (the non-constant)

sum(sel.get_support())

7

In [6]:
# the constant feautures

constant = X_train.columns[~sel.get_support()]

constant

Int64Index([0, 5, 9], dtype='int64')

In [7]:
# drop constant features

X_train_t = sel.transform(X_train)
X_test_t = sel.transform(X_test)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

In [8]:
# sklearn returns numpy arrays. Convert to dataframe

X_train_t = pd.DataFrame(X_train_t, columns=sel.get_feature_names_out())
X_test_t = pd.DataFrame(X_test_t, columns=sel.get_feature_names_out())

# show result
X_train_t.head()

Unnamed: 0,x1,x2,x3,x4,x6,x7,x8
0,0.039801,1.501392,-0.18924,1.546828,-1.831193,1.919634,0.209412
1,-0.078494,-1.536507,-0.496806,0.9651,-0.873804,-1.246872,0.629114
2,-0.731712,0.972453,-0.3093,-1.432922,-0.419046,-0.975984,0.377169
3,-0.121187,0.516685,-0.800862,-0.73617,-1.219396,-2.312341,-1.027631
4,-2.089187,0.899235,-0.241111,1.287536,0.643273,-2.310912,0.085618


## DropConstantFeatures from Feature-engine

Works with numerical and categorical variables.

In [9]:
# To remove constant features
sel = DropConstantFeatures(tol=1)

# fit finds the features with only 1 value
sel.fit(X_train)  

In [10]:
# the constant features

sel.features_to_drop_

[0, 5, 9]

In [11]:
# drop constant features

X_train_t = sel.transform(X_train)
X_test_t = sel.transform(X_test)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

In [12]:
# the result is already a dataframe

X_train_t.head()

Unnamed: 0,1,2,3,4,6,7,8
105,0.039801,1.501392,-0.18924,1.546828,-1.831193,1.919634,0.209412
68,-0.078494,-1.536507,-0.496806,0.9651,-0.873804,-1.246872,0.629114
479,-0.731712,0.972453,-0.3093,-1.432922,-0.419046,-0.975984,0.377169
399,-0.121187,0.516685,-0.800862,-0.73617,-1.219396,-2.312341,-1.027631
434,-2.089187,0.899235,-0.241111,1.287536,0.643273,-2.310912,0.085618


## Pandas .std()

Using the variable standard deviation. Only works with numerical variables.

In [13]:
# find constant features

constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0
]

len(constant_features)

3

In [14]:
# drop these columns from the train and test sets:

X_train_t = X_train.drop(labels=constant_features, axis=1)
X_test_t = X_test.drop(labels=constant_features, axis=1)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

## Pandas .nunique()

Using the number of unique values. Works with numerical and categorical variables.

In [15]:
# the nunique() method from pandas returns the number
# of different values in a variable.

constant_features = [
    feat for feat in X_train.columns if X_train[feat].nunique() == 1
]

len(constant_features)

3

In [16]:
# drop these columns from the train and test sets:

X_train_t = X_train.drop(labels=constant_features, axis=1)
X_test_t = X_test.drop(labels=constant_features, axis=1)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))