## Duplicated features

In [1]:
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from feature_engine.selection import DropDuplicateFeatures

In [2]:
# Toy dataset with duplicated features: n_repeated=6 asks for 6 of the
# 10 columns to be repeats of other columns (there are no constant columns).

features, target = make_classification(
    n_samples=1000,
    n_features=10,
    n_repeated=6,
    n_classes=2,
    random_state=10,
)

# Keep the raw generated data and the pandas objects under separate names,
# so X and y always refer to a DataFrame / Series.
X = pd.DataFrame(features)
y = pd.Series(target)

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-2.44068,0.668398,1.190618,-0.257954,-0.257954,1.190618,0.668398,-2.44068,0.668398,1.190618
1,0.552579,0.23333,-0.204964,0.17394,0.17394,-0.204964,0.23333,0.552579,0.23333,-0.204964
2,-1.382782,-0.983058,0.445872,-0.555166,-0.555166,0.445872,-0.983058,-1.382782,-0.983058,0.445872
3,-0.746352,-1.677396,0.048075,-0.644107,-0.644107,0.048075,-1.677396,-0.746352,-1.677396,0.048075
4,-0.5833,1.212712,0.461374,0.254628,0.254628,0.461374,1.212712,-0.5833,1.212712,0.461374


In [3]:
# hold out 30% of the rows as a test set; the seed is fixed so the split
# is the same on every run

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((700, 10), (300, 10))

In [4]:
# set up the transformer that removes duplicated features
sel = DropDuplicateFeatures()

# fit finds the duplicated features, using the train set only
sel.fit(X_train)  

In [5]:
# the duplicated features that were identified during fit and are
# marked to be dropped

sel.features_to_drop_

{4, 5, 6, 7, 8, 9}

In [6]:
# groups of duplicated features: each set lists columns that were found
# to be duplicates of one another

sel.duplicated_feature_sets_

[{0, 7}, {1, 6, 8}, {2, 5, 9}, {3, 4}]

In [7]:
# apply the fitted selector to both splits so that the duplicated
# columns are removed from train and test alike

X_train_t, X_test_t = (sel.transform(split) for split in (X_train, X_test))

X_train_t.shape, X_test_t.shape

((700, 4), (300, 4))

In [8]:
# the transformed data is already a dataframe, so we can inspect it
# directly with head()

X_train_t.head()

Unnamed: 0,0,1,2,3
105,-3.189671,-1.672739,1.128395,-1.101922
68,0.655279,-1.253133,-0.499965,-0.253241
479,1.719269,-2.113632,-1.114576,-0.311732
399,-1.157952,-1.531827,0.254379,-0.677742
434,2.744846,2.705456,-0.758431,1.328512
