### 2.1 Constant feature
* Constant features have the same value in all the observations

### `Using sklearn`

In [1]:
import numpy as np
import pandas as pd

# Sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Feature engine
from feature_engine.selection import DropConstantFeatures

In [5]:
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=10
)

# convert_array to dataframe and series
X = pd.DataFrame(X)
y = pd.Series(y)

# Add constant features
X[[0, 5, 9]] = 1
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,-0.376539,-0.62018,-0.157567,-1.120805,1,-1.574578,1.678046,1.08018,1
1,1,0.762409,-0.78421,-0.096479,-0.408758,1,0.210942,-0.850449,-0.461301,1
2,1,2.227934,0.547727,-0.341481,-0.817577,1,-2.663678,2.440042,1.698919,1
3,1,0.061129,-0.995868,-0.214351,-0.558957,1,-2.149167,2.294192,-1.383965,1
4,1,0.046349,0.834756,-0.104845,-0.455528,1,-0.911018,0.898098,1.068259,1


In [7]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state=0)

In [12]:
sel = VarianceThreshold(threshold=0)
sel.fit(X_train)

# Features to retain
sum(sel.get_support())

7

In [10]:
# Features to remove
X_train.columns[~sel.get_support()]

Int64Index([0, 5, 9], dtype='int64')

In [14]:
# Remove constant features
X_train_t = sel.transform(X_train)
X_test_t = sel.transform(X_test)
print(X_train_t.shape, X_test_t.shape)

(700, 7) (300, 7)


In [15]:
# Transform the output to data frame
X_train_t = pd.DataFrame(X_train_t, columns = sel.get_feature_names_out())
X_test_t = pd.DataFrame(X_test_t, columns = sel.get_feature_names_out())

X_train_t.head()

Unnamed: 0,x1,x2,x3,x4,x6,x7,x8
0,0.039801,1.501392,-0.18924,1.546828,-1.831193,1.919634,0.209412
1,-0.078494,-1.536507,-0.496806,0.9651,-0.873804,-1.246872,0.629114
2,-0.731712,0.972453,-0.3093,-1.432922,-0.419046,-0.975984,0.377169
3,-0.121187,0.516685,-0.800862,-0.73617,-1.219396,-2.312341,-1.027631
4,-2.089187,0.899235,-0.241111,1.287536,0.643273,-2.310912,0.085618


### `Using feature engine`

In [17]:
sel = DropConstantFeatures(tol=1)
sel.fit(X_train)
sel.features_to_drop_

[0, 5, 9]

In [18]:
# Remove constant features
X_train_t = sel.transform(X_train)
X_test_t = sel.transform(X_test)
print(X_train_t.shape, X_test_t.shape)

(700, 7) (300, 7)


### `Using Pandas`

In [19]:
constant_features = [feat for feat in X_train.columns if X_train[feat].std() == 0]
constant_features

[0, 5, 9]

In [20]:
constant_features = [feat for feat in X_train.columns if X_train[feat].nunique() == 1]
constant_features

[0, 5, 9]

In [24]:
# Remove features
X_train_t = X_train.drop(labels = constant_features, axis = 1)
X_test_t = X_test.drop(labels = constant_features, axis = 1)
X_train_t

Unnamed: 0,1,2,3,4,6,7,8
105,0.039801,1.501392,-0.189240,1.546828,-1.831193,1.919634,0.209412
68,-0.078494,-1.536507,-0.496806,0.965100,-0.873804,-1.246872,0.629114
479,-0.731712,0.972453,-0.309300,-1.432922,-0.419046,-0.975984,0.377169
399,-0.121187,0.516685,-0.800862,-0.736170,-1.219396,-2.312341,-1.027631
434,-2.089187,0.899235,-0.241111,1.287536,0.643273,-2.310912,0.085618
...,...,...,...,...,...,...,...
835,-1.468374,0.848175,-0.071533,-0.013723,-0.849595,0.977174,-0.505686
192,-0.027732,0.963969,-0.185423,-0.836643,-0.677983,0.096945,-1.014776
629,1.300085,0.399210,-0.092703,0.754389,0.762208,-1.711360,-0.858352
559,-1.662055,0.489118,-0.316367,-0.958943,-2.220045,1.864672,-1.041008


## 2. Quasi-constant features

In [2]:
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=10
)

# convert_array to dataframe and series
X = pd.DataFrame(X)
y = pd.Series(y)

In [10]:
# Add quasi-constant features

import random

X.iloc[random.sample(range(0, 1000), 990), [5,7,9]] = 1
X[5].value_counts()

 1.000000    990
 0.575817      1
-0.252115      1
-0.713968      1
-0.566143      1
 0.417586      1
 1.374848      1
 1.052890      1
 0.307155      1
 0.205710      1
 0.814039      1
Name: 5, dtype: int64

In [11]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state=0)

In [12]:
sel = VarianceThreshold(threshold=0.2)
sel.fit(X_train)

# Features to retain
sum(sel.get_support())

7

In [13]:
# Features to remove
X_train.columns[~sel.get_support()]

Int64Index([5, 7, 9], dtype='int64')

In [14]:
# Remove constant features
X_train_t = sel.transform(X_train)
X_test_t = sel.transform(X_test)
print(X_train_t.shape, X_test_t.shape)

(700, 7) (300, 7)


In [15]:
# Transform the output to data frame
X_train_t = pd.DataFrame(X_train_t, columns = sel.get_feature_names_out())
X_test_t = pd.DataFrame(X_test_t, columns = sel.get_feature_names_out())

X_train_t.head()

Unnamed: 0,x0,x1,x2,x3,x4,x6,x8
0,-1.155673,0.039801,1.501392,-0.18924,1.546828,-1.831193,0.209412
1,0.404169,-0.078494,-1.536507,-0.496806,0.9651,-0.873804,0.629114
2,0.36085,-0.731712,0.972453,-0.3093,-1.432922,-0.419046,0.377169
3,0.816893,-0.121187,0.516685,-0.800862,-0.73617,-1.219396,-1.027631
4,1.129063,-2.089187,0.899235,-0.241111,1.287536,0.643273,0.085618


### Using `feature engine`

In [16]:
sel = DropConstantFeatures(tol=0.95)
sel.fit(X_train)
sel.features_to_drop_

[5, 7, 9]

In [17]:
X_train_t = sel.transform(X_train)
X_test_t = sel.transform(X_test)

### Using `Pandas`

In [20]:
constant_features = [feat for feat in X_train.columns if X_train[feat].std() <= 0.2]
constant_features

[5, 7, 9]

In [21]:
quasi_constant_feat = []

for feature in X_train.columns:
    predominant = X_train[feature].value_counts(normalize = True).sort_values(ascending = False).values[0]
    if predominant > 0.95:
        quasi_constant_feat.append(feature)

In [22]:
quasi_constant_feat

[5, 7, 9]

In [23]:
# Remove features
X_train_t = X_train.drop(labels = quasi_constant_feat, axis = 1)
X_test_t = X_test.drop(labels = quasi_constant_feat, axis = 1)
X_train_t

Unnamed: 0,0,1,2,3,4,6,8
105,-1.155673,0.039801,1.501392,-0.189240,1.546828,-1.831193,0.209412
68,0.404169,-0.078494,-1.536507,-0.496806,0.965100,-0.873804,0.629114
479,0.360850,-0.731712,0.972453,-0.309300,-1.432922,-0.419046,0.377169
399,0.816893,-0.121187,0.516685,-0.800862,-0.736170,-1.219396,-1.027631
434,1.129063,-2.089187,0.899235,-0.241111,1.287536,0.643273,0.085618
...,...,...,...,...,...,...,...
835,-0.574422,-1.468374,0.848175,-0.071533,-0.013723,-0.849595,-0.505686
192,-0.156688,-0.027732,0.963969,-0.185423,-0.836643,-0.677983,-1.014776
629,0.884134,1.300085,0.399210,-0.092703,0.754389,0.762208,-0.858352
559,-1.196690,-1.662055,0.489118,-0.316367,-0.958943,-2.220045,-1.041008


## 3. Duplicate Features